# Q-Q Plot
# install.packages("ggpubr")
library(ggpubr)
## Loading required package: ggplot2
# Post Hoc Test (dunnTest)
# install.packages("FSA")
library(FSA)
## ## FSA v0.8.30. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
# Stepwise Regression
# install.packages('MASS')
library(MASS)

# Residual Skewness
# install.packages("moments")
library(moments)

STEP 1: Data Set Preparation

Raw Data Set September 2009

  • Load data from September 2009
# Folder holding the original data files. It defaults to the path this script
# was written against; set the LFS_DATA_DIR environment variable to run the
# script on another machine without editing the code.
lfs.data.dir <- Sys.getenv(
  "LFS_DATA_DIR",
  unset = "C:/Users/alber/Desktop/Ryerson/6. Spring-Summer 2020/Datasets Original"
)
raw.data.09 <- read.csv(file.path(lfs.data.dir, "pub0909.csv"))
head(raw.data.09)
##   REC_NUM SURVYEAR SURVMNTH LFSSTAT PROV CMA AGE_12 AGE_6 SEX MARSTAT ED76to89
## 1       1     2009        9       6   35   4     10    NA   2       1       NA
## 2       2     2009        9       6   48   4     12    NA   1       6       NA
## 3       3     2009        9       6   35   4     12    NA   1       1       NA
## 4       4     2009        9       1   46   4      4    NA   1       6       NA
## 5       5     2009        9       1   35   4      8    NA   2       1       NA
## 6       6     2009        9       6   35   4      9    NA   2       1       NA
##   EDUC90 MJH EVERWORK FTPTLAST COWMAIN FILLER1 FILLER2 NAICS_18 NAICS_43
## 1      2  NA        1        1       2      NA      NA       11       34
## 2      1  NA        2       NA      NA      NA      NA       NA       NA
## 3      2  NA        2       NA      NA      NA      NA       NA       NA
## 4      4   1       NA       NA       2      NA      NA       17       40
## 5      4   1       NA       NA       2      NA      NA        8       27
## 6      1  NA        2       NA      NA      NA      NA       NA       NA
##   SOC80_49 SOC80_21 NOCS_01_25 NOCS_01_47 YABSENT WKSAWAY PAYAWAY UHRSMAIN
## 1       NA       NA          6         12      NA      NA      NA       NA
## 2       NA       NA         NA         NA      NA      NA      NA       NA
## 3       NA       NA         NA         NA      NA      NA      NA       NA
## 4       NA       NA         17         31      NA      NA      NA      450
## 5       NA       NA          5         10      NA      NA      NA      375
## 6       NA       NA         NA         NA      NA      NA      NA       NA
##   AHRSMAIN FTPTMAIN UTOTHRS ATOTHRS HRSAWAY YAWAY PAIDOT UNPAIDOT XTRAHRS
## 1       NA       NA      NA      NA      NA    NA     NA       NA      NA
## 2       NA       NA      NA      NA      NA    NA     NA       NA      NA
## 3       NA       NA      NA      NA      NA    NA     NA       NA      NA
## 4      450        1     450     450       0    NA      0        0       0
## 5      375        1     375     375       0    NA      0        0       0
## 6       NA       NA      NA      NA      NA    NA     NA       NA      NA
##   WHYPTOLD WHYPTNEW TENURE PREVTEN HRLYEARN UNION PERMTEMP ESTSIZE FIRMSIZE
## 1       NA       NA     NA     240       NA    NA       NA      NA       NA
## 2       NA       NA     NA      NA       NA    NA       NA      NA       NA
## 3       NA       NA     NA      NA       NA    NA       NA      NA       NA
## 4       NA       NA    105      NA     2564     3        1       2        2
## 5       NA       NA     14      NA     1949     3        1       2        2
## 6       NA       NA     NA      NA       NA    NA       NA      NA       NA
##   DURUNEMP FLOWUNEM UNEMFTPT WHYLEFTO WHYLEFTN DURJLESS AVAILABL LKPUBAG
## 1       NA       NA       NA        5        7        3       NA      NA
## 2       NA       NA       NA       NA       NA      225       NA      NA
## 3       NA       NA       NA       NA       NA       88       NA      NA
## 4       NA       NA       NA       NA       NA       NA       NA      NA
## 5       NA       NA       NA       NA       NA       NA       NA      NA
## 6       NA       NA       NA       NA       NA       36       NA      NA
##   LKEMPLOY LKRELS LKATADS LKANSADS LKOTHERN PRIORACT YNOLKOLD YNOLOOK TLOLOOK
## 1       NA     NA      NA       NA       NA       NA       NA      NA      NA
## 2       NA     NA      NA       NA       NA       NA       NA      NA      NA
## 3       NA     NA      NA       NA       NA       NA       NA      NA      NA
## 4       NA     NA      NA       NA       NA       NA       NA      NA      NA
## 5       NA     NA      NA       NA       NA       NA       NA      NA      NA
## 6       NA     NA      NA       NA       NA       NA       NA      NA      NA
##   SCHOOLN RELREFN EFAMTYPE EFAMSIZE EFAMEMPL EFAMUNEM SP_AGE7 SP_LFSST SPED7689
## 1       1       2        5        2        1        0       6        2       NA
## 2      NA       1        1        1        0        0      NA       NA       NA
## 3      NA       1        8        2        1        0       6        2       NA
## 4       1       1        1        1        1        0      NA       NA       NA
## 5       1       1        2        2        2        0       5        1       NA
## 6       1       1        5        2        1        0       7        1       NA
##   SPED1990 SP_SOC80 SP_NOCS01 SP_UHRSM SP_UHRST SP_COWM AGYOWNKN SCH1624
## 1        3       NA        12        2        2       2       NA      NA
## 2       NA       NA        NA       NA       NA      NA       NA      NA
## 3        2       NA        14        1        1       2       NA      NA
## 4       NA       NA        NA       NA       NA      NA       NA      NA
## 5        1       NA         5        4        4       2       NA      NA
## 6        1       NA        21        5        5       2       NA      NA
##   FINALWT
## 1     181
## 2     424
## 3     676
## 4      90
## 5     415
## 6     160
nrow(raw.data.09)
## [1] 107593
  • Basic stats of Labor Force Status attribute
summary(raw.data.09$LFSSTAT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   2.925   6.000   6.000
sum(is.na(raw.data.09$LFSSTAT))
## [1] 0
str(raw.data.09$LFSSTAT)
##  int [1:107593] 6 6 6 1 1 6 6 6 6 1 ...
  • Select right observations
# Only Employed Data
# which() converts the comparison into row positions, so a missing (NA) code
# drops the row instead of producing an all-NA placeholder row, which is what
# a logical row index containing NA would do.
data.09 <- raw.data.09[which(raw.data.09$LFSSTAT < 3), ]

# Only Public and Private Sector Employees Data
data.09 <- data.09[which(data.09$COWMAIN < 3), ]
  • Match values to 2019
# OCCUPATION Variable to match 2019's NOC_10
data.09$NOC_10 <- data.09$NOCS_01_25
sort(unique(data.09$NOC_10))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
table(data.09$NOC_10)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  255 3332 1210 2720 5892 3380 1617 2306 2465 2678  952 1223 4182 2187  982  820 
##   17   18   19   20   21   22   23   24   25 
## 5184  638 1233 3186 2307 1402 1581 2309  516
# Collapse the 25 occupation codes held in NOC_10 into 10 groups.
# The lookup vector is indexed by the 25-group code and returns the 10-group
# code: 1-2 -> 1, 3-5 -> 2, 6 -> 3, 7-8 -> 4, 9-10 -> 5, 11 -> 6,
# 12-17 -> 7, 18-22 -> 8, 23 -> 9, 24-25 -> 10.
noc25.to.noc10 <- c(1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 6,
                    rep(7, 6), rep(8, 5), 9, 10, 10)
noc25.codes <- data.09$NOC_10
# Codes outside 1-25 (including NA) are left exactly as they are.
in.lookup <- noc25.codes %in% seq_along(noc25.to.noc10)
data.09$NOC_10 <- as.numeric(noc25.codes)
data.09$NOC_10[in.lookup] <- noc25.to.noc10[noc25.codes[in.lookup]]
sort(unique(data.09$NOC_10))
##  [1]  1  2  3  4  5  6  7  8  9 10
table(data.09$NOC_10)
## 
##     1     2     3     4     5     6     7     8     9    10 
##  3587  9822  3380  3923  5143   952 14578  8766  1581  2825
# AGE OF YOUNGEST CHILD
sort(unique(data.09$AGYOWNKN))
## [1] 1 2 3 4 5 6
table(data.09$AGYOWNKN)
## 
##    1    2    3    4    5    6 
## 4257 2693 6057 2863 1930 3640
# Collapse the six AGYOWNKN codes into four groups:
# 1-2 -> 1, 3 -> 2, 4-5 -> 3, 6 -> 4.
agyownkn.lookup <- c(1, 1, 2, 3, 3, 4)
agyownkn.codes <- data.09$AGYOWNKN
# NA and any code outside 1-6 are left untouched.
in.lookup <- agyownkn.codes %in% seq_along(agyownkn.lookup)
data.09$AGYOWNKN <- as.numeric(agyownkn.codes)
data.09$AGYOWNKN[in.lookup] <- agyownkn.lookup[agyownkn.codes[in.lookup]]
sort(unique(data.09$AGYOWNKN))
## [1] 1 2 3 4
table(data.09$AGYOWNKN)
## 
##    1    2    3    4 
## 6950 6057 4793 3640
# CURRENT STUDENT STATUS
sort(unique(data.09$SCHOOLN))
## [1] 1 2 3 4 5 6 7 8 9
table(data.09$SCHOOLN)
## 
##     1     2     3     4     5     6     7     8     9 
## 48304  1587   110  1363   654   908   328    88   223
# Collapse SCHOOLN to three codes: 1 stays 1, the even codes 2-8 become 2 and
# the odd codes 3-9 become 3. Both rules read a snapshot of the original
# column, so neither depends on the other having run first.
school.codes <- data.09$SCHOOLN
data.09$SCHOOLN[school.codes %in% c(2, 4, 6, 8)] <- 2
data.09$SCHOOLN[school.codes %in% c(3, 5, 7, 9)] <- 3
sort(unique(data.09$SCHOOLN))
## [1] 1 2 3
table(data.09$SCHOOLN)
## 
##     1     2     3 
## 48304  3946  1315
  • Remove variables
# Remove Out-of-Date Variables
names(data.09)
##  [1] "REC_NUM"    "SURVYEAR"   "SURVMNTH"   "LFSSTAT"    "PROV"      
##  [6] "CMA"        "AGE_12"     "AGE_6"      "SEX"        "MARSTAT"   
## [11] "ED76to89"   "EDUC90"     "MJH"        "EVERWORK"   "FTPTLAST"  
## [16] "COWMAIN"    "FILLER1"    "FILLER2"    "NAICS_18"   "NAICS_43"  
## [21] "SOC80_49"   "SOC80_21"   "NOCS_01_25" "NOCS_01_47" "YABSENT"   
## [26] "WKSAWAY"    "PAYAWAY"    "UHRSMAIN"   "AHRSMAIN"   "FTPTMAIN"  
## [31] "UTOTHRS"    "ATOTHRS"    "HRSAWAY"    "YAWAY"      "PAIDOT"    
## [36] "UNPAIDOT"   "XTRAHRS"    "WHYPTOLD"   "WHYPTNEW"   "TENURE"    
## [41] "PREVTEN"    "HRLYEARN"   "UNION"      "PERMTEMP"   "ESTSIZE"   
## [46] "FIRMSIZE"   "DURUNEMP"   "FLOWUNEM"   "UNEMFTPT"   "WHYLEFTO"  
## [51] "WHYLEFTN"   "DURJLESS"   "AVAILABL"   "LKPUBAG"    "LKEMPLOY"  
## [56] "LKRELS"     "LKATADS"    "LKANSADS"   "LKOTHERN"   "PRIORACT"  
## [61] "YNOLKOLD"   "YNOLOOK"    "TLOLOOK"    "SCHOOLN"    "RELREFN"   
## [66] "EFAMTYPE"   "EFAMSIZE"   "EFAMEMPL"   "EFAMUNEM"   "SP_AGE7"   
## [71] "SP_LFSST"   "SPED7689"   "SPED1990"   "SP_SOC80"   "SP_NOCS01" 
## [76] "SP_UHRSM"   "SP_UHRST"   "SP_COWM"    "AGYOWNKN"   "SCH1624"   
## [81] "FINALWT"    "NOC_10"
# Columns to drop from the 2009 extract, listed by name.
outdated.vars <- c(
  "ED76to89", "NAICS_43", "SOC80_49", "SOC80_21", "NOCS_01_25", "NOCS_01_47",
  "WHYPTOLD", "YNOLKOLD", "RELREFN", "EFAMSIZE", "EFAMEMPL", "EFAMUNEM",
  "SP_AGE7", "SP_LFSST", "SPED7689", "SPED1990", "SP_SOC80", "SP_NOCS01",
  "SP_UHRSM", "SP_UHRST", "SP_COWM", "SCH1624"
)
# Stop if a listed column is absent, so a typo cannot pass unnoticed.
stopifnot(all(outdated.vars %in% names(data.09)))
# Keep every other column in its existing order.
data.09 <- data.09[, setdiff(names(data.09), outdated.vars), drop = FALSE]
names(data.09)
##  [1] "REC_NUM"  "SURVYEAR" "SURVMNTH" "LFSSTAT"  "PROV"     "CMA"     
##  [7] "AGE_12"   "AGE_6"    "SEX"      "MARSTAT"  "EDUC90"   "MJH"     
## [13] "EVERWORK" "FTPTLAST" "COWMAIN"  "FILLER1"  "FILLER2"  "NAICS_18"
## [19] "YABSENT"  "WKSAWAY"  "PAYAWAY"  "UHRSMAIN" "AHRSMAIN" "FTPTMAIN"
## [25] "UTOTHRS"  "ATOTHRS"  "HRSAWAY"  "YAWAY"    "PAIDOT"   "UNPAIDOT"
## [31] "XTRAHRS"  "WHYPTNEW" "TENURE"   "PREVTEN"  "HRLYEARN" "UNION"   
## [37] "PERMTEMP" "ESTSIZE"  "FIRMSIZE" "DURUNEMP" "FLOWUNEM" "UNEMFTPT"
## [43] "WHYLEFTO" "WHYLEFTN" "DURJLESS" "AVAILABL" "LKPUBAG"  "LKEMPLOY"
## [49] "LKRELS"   "LKATADS"  "LKANSADS" "LKOTHERN" "PRIORACT" "YNOLOOK" 
## [55] "TLOLOOK"  "SCHOOLN"  "EFAMTYPE" "AGYOWNKN" "FINALWT"  "NOC_10"
# Remove Unemployment and other Variables specified on main report
unused.vars <- c(
  "AGE_6", "EVERWORK", "FTPTLAST", "YABSENT", "WKSAWAY", "PAYAWAY",
  "UHRSMAIN", "AHRSMAIN", "ATOTHRS", "HRSAWAY", "YAWAY", "PAIDOT",
  "UNPAIDOT", "XTRAHRS", "WHYPTNEW", "PREVTEN", "DURUNEMP", "FLOWUNEM",
  "UNEMFTPT", "WHYLEFTO", "WHYLEFTN", "DURJLESS", "AVAILABL", "LKPUBAG",
  "LKEMPLOY", "LKRELS", "LKATADS", "LKANSADS", "LKOTHERN", "PRIORACT",
  "YNOLOOK", "TLOLOOK", "FINALWT"
)
# Stop if a listed column is absent, so a typo cannot pass unnoticed.
stopifnot(all(unused.vars %in% names(data.09)))
# Keep every other column in its existing order.
data.09 <- data.09[, setdiff(names(data.09), unused.vars), drop = FALSE]
  • Rename attributes and change column order to match 2019
# Rename Variables
# Old 2009 name -> name used by the 2019 file. A name that is not present in
# data.09 is skipped; WHYPTNEW is removed by the column-drop step earlier in
# this script, so its entry currently changes nothing.
rename.to.2019 <- c(EDUC90 = "EDUC",
                    FILLER1 = "IMMIG",
                    FILLER2 = "NOC_40",
                    WHYPTNEW = "WHYPT",
                    AGYOWNKN = "AGYOWNK")
to.rename <- names(data.09) %in% names(rename.to.2019)
names(data.09)[to.rename] <- unname(rename.to.2019[names(data.09)[to.rename]])

# Move variables NOC_10, NAICS_18
# Place NAICS_18 and NOC_10 directly after IMMIG. Positions are found by exact
# name (match) instead of a regular expression and hard-coded column numbers,
# so the result stays correct if the drop lists above ever change.
noc10.idx <- match("NOC_10", names(data.09))
naics18.idx <- match("NAICS_18", names(data.09))
immig.idx <- match("IMMIG", names(data.09))
stopifnot(!anyNA(c(noc10.idx, naics18.idx, immig.idx)))
other.idx <- setdiff(seq_along(data.09), c(naics18.idx, noc10.idx))
data.09 <- data.09[, c(other.idx[other.idx <= immig.idx],
                       naics18.idx,
                       noc10.idx,
                       other.idx[other.idx > immig.idx])]
names(data.09)
##  [1] "REC_NUM"  "SURVYEAR" "SURVMNTH" "LFSSTAT"  "PROV"     "CMA"     
##  [7] "AGE_12"   "SEX"      "MARSTAT"  "EDUC"     "MJH"      "COWMAIN" 
## [13] "IMMIG"    "NAICS_18" "NOC_10"   "NOC_40"   "FTPTMAIN" "UTOTHRS" 
## [19] "TENURE"   "HRLYEARN" "UNION"    "PERMTEMP" "ESTSIZE"  "FIRMSIZE"
## [25] "SCHOOLN"  "EFAMTYPE" "AGYOWNK"

Raw Data Set September 2019

  • Load data from September 2019
# Folder holding the original data files. It defaults to the path this script
# was written against; set the LFS_DATA_DIR environment variable to run the
# script on another machine without editing the code.
lfs.data.dir <- Sys.getenv(
  "LFS_DATA_DIR",
  unset = "C:/Users/alber/Desktop/Ryerson/6. Spring-Summer 2020/Datasets Original"
)
raw.data.19 <- read.csv(file.path(lfs.data.dir, "pub0919.csv"))
head(raw.data.19)
##   REC_NUM SURVYEAR SURVMNTH LFSSTAT PROV CMA AGE_12 AGE_6 SEX MARSTAT EDUC MJH
## 1       1     2019        9       2   35   0      7    NA   2       2    4   1
## 2       2     2019        9       1   59   0      4    NA   1       6    4   1
## 3       3     2019        9       1   59   9      3     6   2       6    6   1
## 4       4     2019        9       4   35   0      9    NA   2       1    2  NA
## 5       5     2019        9       1   24   0      6    NA   1       2    4   1
## 6       6     2019        9       1   35   3      8    NA   2       1    4   1
##   EVERWORK FTPTLAST COWMAIN IMMIG NAICS_21 NOC_10 NOC_40 YABSENT WKSAWAY
## 1       NA       NA       2     3       19      7     26       3       2
## 2       NA       NA       2     3       20      2      5      NA      NA
## 3       NA       NA       2     3       19      7     24      NA      NA
## 4        1        2      NA     3       NA     NA     NA      NA      NA
## 5       NA       NA       5     3        1      1      4      NA      NA
## 6       NA       NA       1     3        5      7     26      NA      NA
##   PAYAWAY UHRSMAIN AHRSMAIN FTPTMAIN UTOTHRS ATOTHRS HRSAWAY YAWAY PAIDOT
## 1       2      250        0        2     250       0      NA    NA     NA
## 2      NA      400      240        1     400     240     160     3      0
## 3      NA      400      400        1     400     400       0    NA      0
## 4      NA       NA       NA       NA      NA      NA      NA    NA     NA
## 5      NA      700      700        1     700     700      NA    NA     NA
## 6      NA      363      363        1     363     363       0    NA      0
##   UNPAIDOT XTRAHRS WHYPT TENURE PREVTEN HRLYEARN UNION PERMTEMP ESTSIZE
## 1       NA      NA     7     30      NA     2500     3        1       2
## 2        0       0    NA     35      NA     6346     3        1       1
## 3        0       0    NA     53      NA     2450     3        1       1
## 4       NA      NA    NA     NA      NA       NA    NA       NA      NA
## 5       NA      NA    NA     72      NA       NA    NA       NA      NA
## 6        0       0    NA    240      NA     4000     1        1       4
##   FIRMSIZE DURUNEMP FLOWUNEM UNEMFTPT WHYLEFTO WHYLEFTN DURJLESS AVAILABL
## 1        4       NA       NA       NA       NA       NA       NA       NA
## 2        4       NA       NA       NA       NA       NA       NA       NA
## 3        1       NA       NA       NA       NA       NA       NA       NA
## 4       NA       NA       NA       NA        1        1       11       NA
## 5       NA       NA       NA       NA       NA       NA       NA       NA
## 6        4       NA       NA       NA       NA       NA       NA       NA
##   LKPUBAG LKEMPLOY LKRELS LKATADS LKANSADS LKOTHERN PRIORACT YNOLOOK TLOLOOK
## 1      NA       NA     NA      NA       NA       NA       NA      NA      NA
## 2      NA       NA     NA      NA       NA       NA       NA      NA      NA
## 3      NA       NA     NA      NA       NA       NA       NA      NA      NA
## 4      NA       NA     NA      NA       NA       NA       NA      NA      NA
## 5      NA       NA     NA      NA       NA       NA       NA      NA      NA
## 6      NA       NA     NA      NA       NA       NA       NA      NA      NA
##   SCHOOLN EFAMTYPE AGYOWNK FINALWT
## 1       1        2      NA     217
## 2       1        1      NA     302
## 3       1        1      NA     195
## 4       1       11      NA     231
## 5       1        3       2      79
## 6       1        3       3     640
nrow(raw.data.19)
## [1] 100011
  • Basic stats of Labor Force Status attribute
summary(raw.data.19$LFSSTAT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   2.208   4.000   4.000
sum(is.na(raw.data.19$LFSSTAT))
## [1] 0
str(raw.data.19$LFSSTAT)
##  int [1:100011] 2 1 1 4 1 1 4 1 1 4 ...
  • Select right observations
# Only Employed
# which() converts the comparison into row positions, so a missing (NA) code
# drops the row instead of producing an all-NA placeholder row, which is what
# a logical row index containing NA would do.
data.19 <- raw.data.19[which(raw.data.19$LFSSTAT < 3), ]

# Only Public and Private Sector Employees
data.19 <- data.19[which(data.19$COWMAIN < 3), ]
  • Match values to 2009
# CMA
sort(unique(data.19$CMA))
##  [1] 0 1 2 3 4 5 6 7 8 9
table(data.19$CMA)
## 
##     0     1     2     3     4     5     6     7     8     9 
## 36367   824  2044   848  2974   757  2589  1356  1248  1924
# Recode the 2019 CMA codes (0-9) onto the four codes that are labelled
# "Montreal", "Toronto", "Vancouver" and "Other" when CMA is turned into a
# factor later in this script. Every rule reads a snapshot of the original
# column, so no temporary placeholder codes are needed and the outcome does
# not depend on the order of the statements.
cma.2019 <- data.19$CMA
data.19$CMA[cma.2019 %in% c(0, 1, 3, 5, 6, 7, 8)] <- 4  # -> "Other"
data.19$CMA[cma.2019 %in% 2] <- 1                       # -> "Montreal"
data.19$CMA[cma.2019 %in% 4] <- 2                       # -> "Toronto"
data.19$CMA[cma.2019 %in% 9] <- 3                       # -> "Vancouver"
sort(unique(data.19$CMA))
## [1] 1 2 3 4
table(data.19$CMA)
## 
##     1     2     3     4 
##  2044  2974  1924 43989
# NAICS_21 TO NAICS_18 (2009)
sort(unique(data.19$NAICS_21))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21
table(data.19$NAICS_21)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  709  202   59 1220  495 3746 2736 2308 1590 6125 2559 1863  643 2494 1621 4465 
##   17   18   19   20   21 
## 7432 1858 3540 1716 3550
# Collapse the 21 industry codes of NAICS_21 into 18 groups, then rename the
# column to NAICS_18. The lookup vector is indexed by the 21-group code:
# 1 -> 1, 2-4 -> 2, 5-11 -> 3-9, 12-13 -> 10, 14-21 -> 11-18.
naics21.to.naics18 <- c(1, 2, 2, 2, 3:9, 10, 10, 11:18)
naics21.codes <- data.19$NAICS_21
# Codes outside 1-21 (including NA) are left exactly as they are.
in.lookup <- naics21.codes %in% seq_along(naics21.to.naics18)
data.19$NAICS_21 <- as.numeric(naics21.codes)
data.19$NAICS_21[in.lookup] <- naics21.to.naics18[naics21.codes[in.lookup]]
names(data.19)[names(data.19) == "NAICS_21"] <- "NAICS_18"
sort(unique(data.19$NAICS_18))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18
table(data.19$NAICS_18)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  709 1481  495 3746 2736 2308 1590 6125 2559 2506 2494 1621 4465 7432 1858 3540 
##   17   18 
## 1716 3550
  • Remove variables
# Remove Unemployment and other Variables specified on main report
names(data.19)
##  [1] "REC_NUM"  "SURVYEAR" "SURVMNTH" "LFSSTAT"  "PROV"     "CMA"     
##  [7] "AGE_12"   "AGE_6"    "SEX"      "MARSTAT"  "EDUC"     "MJH"     
## [13] "EVERWORK" "FTPTLAST" "COWMAIN"  "IMMIG"    "NAICS_18" "NOC_10"  
## [19] "NOC_40"   "YABSENT"  "WKSAWAY"  "PAYAWAY"  "UHRSMAIN" "AHRSMAIN"
## [25] "FTPTMAIN" "UTOTHRS"  "ATOTHRS"  "HRSAWAY"  "YAWAY"    "PAIDOT"  
## [31] "UNPAIDOT" "XTRAHRS"  "WHYPT"    "TENURE"   "PREVTEN"  "HRLYEARN"
## [37] "UNION"    "PERMTEMP" "ESTSIZE"  "FIRMSIZE" "DURUNEMP" "FLOWUNEM"
## [43] "UNEMFTPT" "WHYLEFTO" "WHYLEFTN" "DURJLESS" "AVAILABL" "LKPUBAG" 
## [49] "LKEMPLOY" "LKRELS"   "LKATADS"  "LKANSADS" "LKOTHERN" "PRIORACT"
## [55] "YNOLOOK"  "TLOLOOK"  "SCHOOLN"  "EFAMTYPE" "AGYOWNK"  "FINALWT"
# Columns to drop from the 2019 extract, listed by name.
unused.vars <- c(
  "AGE_6", "EVERWORK", "FTPTLAST", "YABSENT", "WKSAWAY", "PAYAWAY",
  "UHRSMAIN", "AHRSMAIN", "ATOTHRS", "HRSAWAY", "YAWAY", "PAIDOT",
  "UNPAIDOT", "XTRAHRS", "WHYPT", "PREVTEN", "DURUNEMP", "FLOWUNEM",
  "UNEMFTPT", "WHYLEFTO", "WHYLEFTN", "DURJLESS", "AVAILABL", "LKPUBAG",
  "LKEMPLOY", "LKRELS", "LKATADS", "LKANSADS", "LKOTHERN", "PRIORACT",
  "YNOLOOK", "TLOLOOK", "FINALWT"
)
# Stop if a listed column is absent, so a typo cannot pass unnoticed.
stopifnot(all(unused.vars %in% names(data.19)))
# Keep every other column in its existing order.
data.19 <- data.19[, setdiff(names(data.19), unused.vars), drop = FALSE]

# Check both data sets have the same variables
names(data.19)
##  [1] "REC_NUM"  "SURVYEAR" "SURVMNTH" "LFSSTAT"  "PROV"     "CMA"     
##  [7] "AGE_12"   "SEX"      "MARSTAT"  "EDUC"     "MJH"      "COWMAIN" 
## [13] "IMMIG"    "NAICS_18" "NOC_10"   "NOC_40"   "FTPTMAIN" "UTOTHRS" 
## [19] "TENURE"   "HRLYEARN" "UNION"    "PERMTEMP" "ESTSIZE"  "FIRMSIZE"
## [25] "SCHOOLN"  "EFAMTYPE" "AGYOWNK"
names(data.09)
##  [1] "REC_NUM"  "SURVYEAR" "SURVMNTH" "LFSSTAT"  "PROV"     "CMA"     
##  [7] "AGE_12"   "SEX"      "MARSTAT"  "EDUC"     "MJH"      "COWMAIN" 
## [13] "IMMIG"    "NAICS_18" "NOC_10"   "NOC_40"   "FTPTMAIN" "UTOTHRS" 
## [19] "TENURE"   "HRLYEARN" "UNION"    "PERMTEMP" "ESTSIZE"  "FIRMSIZE"
## [25] "SCHOOLN"  "EFAMTYPE" "AGYOWNK"

Union of 2 data sets into one

# Stack the 2009 rows on top of the 2019 rows. Both data frames were printed
# just above and carry the same 27 column names in the same order.
data.all <- rbind(data.09, data.19)
str(data.all)
## 'data.frame':    105488 obs. of  27 variables:
##  $ REC_NUM : int  4 5 10 11 12 15 16 18 19 21 ...
##  $ SURVYEAR: int  2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
##  $ SURVMNTH: int  9 9 9 9 9 9 9 9 9 9 ...
##  $ LFSSTAT : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ PROV    : int  46 35 35 24 12 48 24 12 48 59 ...
##  $ CMA     : num  4 4 4 4 4 4 4 4 4 3 ...
##  $ AGE_12  : int  4 8 6 4 3 9 8 2 2 10 ...
##  $ SEX     : int  1 2 1 2 1 1 1 1 2 2 ...
##  $ MARSTAT : int  6 1 1 1 2 1 2 6 1 6 ...
##  $ EDUC    : int  4 4 2 4 5 1 4 2 4 1 ...
##  $ MJH     : int  1 1 1 2 1 1 1 1 1 1 ...
##  $ COWMAIN : int  2 2 2 2 1 2 2 2 2 2 ...
##  $ IMMIG   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ NAICS_18: num  17 8 5 14 13 6 4 12 14 14 ...
##  $ NOC_10  : num  7 2 10 5 5 8 8 2 4 7 ...
##  $ NOC_40  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ FTPTMAIN: int  1 1 1 2 2 1 1 1 1 1 ...
##  $ UTOTHRS : int  450 375 400 496 240 400 370 375 400 450 ...
##  $ TENURE  : int  105 14 115 36 7 237 39 11 29 39 ...
##  $ HRLYEARN: int  2564 1949 3750 1690 667 2550 2600 1380 2100 1700 ...
##  $ UNION   : int  3 3 1 3 1 1 1 3 3 1 ...
##  $ PERMTEMP: int  1 1 1 1 4 1 1 1 1 1 ...
##  $ ESTSIZE : int  2 2 4 1 4 4 4 3 1 1 ...
##  $ FIRMSIZE: int  2 2 4 1 4 4 4 4 1 1 ...
##  $ SCHOOLN : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ EFAMTYPE: int  1 2 3 2 2 5 5 1 2 1 ...
##  $ AGYOWNK : num  NA NA 2 NA NA NA NA NA NA NA ...
summary(data.all)
##     REC_NUM          SURVYEAR       SURVMNTH    LFSSTAT           PROV      
##  Min.   :     1   Min.   :2009   Min.   :9   Min.   :1.000   Min.   :10.00  
##  1st Qu.: 25978   1st Qu.:2009   1st Qu.:9   1st Qu.:1.000   1st Qu.:24.00  
##  Median : 51785   Median :2009   Median :9   Median :1.000   Median :35.00  
##  Mean   : 51925   Mean   :2014   Mean   :9   Mean   :1.071   Mean   :35.28  
##  3rd Qu.: 77717   3rd Qu.:2019   3rd Qu.:9   3rd Qu.:1.000   3rd Qu.:47.00  
##  Max.   :107593   Max.   :2019   Max.   :9   Max.   :2.000   Max.   :59.00  
##                                                                             
##       CMA            AGE_12            SEX           MARSTAT     
##  Min.   :1.000   Min.   : 1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.: 4.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :4.000   Median : 6.000   Median :2.000   Median :2.000  
##  Mean   :3.726   Mean   : 5.765   Mean   :1.503   Mean   :2.877  
##  3rd Qu.:4.000   3rd Qu.: 8.000   3rd Qu.:2.000   3rd Qu.:6.000  
##  Max.   :4.000   Max.   :12.000   Max.   :2.000   Max.   :6.000  
##                                                                  
##       EDUC            MJH           COWMAIN          IMMIG      
##  Min.   :0.000   Min.   :1.000   Min.   :1.000   Min.   :1.00   
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:3.00   
##  Median :4.000   Median :1.000   Median :2.000   Median :3.00   
##  Mean   :3.474   Mean   :1.055   Mean   :1.733   Mean   :2.77   
##  3rd Qu.:4.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:3.00   
##  Max.   :6.000   Max.   :2.000   Max.   :2.000   Max.   :3.00   
##                                                  NA's   :54557  
##     NAICS_18         NOC_10           NOC_40         FTPTMAIN    
##  Min.   : 1.00   Min.   : 1.000   Min.   : 1.00   Min.   :1.000  
##  1st Qu.: 7.00   1st Qu.: 3.000   1st Qu.:10.00   1st Qu.:1.000  
##  Median :11.00   Median : 7.000   Median :22.00   Median :1.000  
##  Mean   :10.63   Mean   : 5.425   Mean   :19.91   Mean   :1.183  
##  3rd Qu.:14.00   3rd Qu.: 7.000   3rd Qu.:28.00   3rd Qu.:1.000  
##  Max.   :18.00   Max.   :10.000   Max.   :40.00   Max.   :2.000  
##                                   NA's   :54557                  
##     UTOTHRS          TENURE          HRLYEARN         UNION      
##  Min.   :  4.0   Min.   :  1.00   Min.   :  200   Min.   :1.000  
##  1st Qu.:350.0   1st Qu.: 15.00   1st Qu.: 1500   1st Qu.:1.000  
##  Median :400.0   Median : 53.00   Median : 2100   Median :3.000  
##  Mean   :362.3   Mean   : 86.02   Mean   : 2437   Mean   :2.353  
##  3rd Qu.:400.0   3rd Qu.:143.00   3rd Qu.: 3077   3rd Qu.:3.000  
##  Max.   :990.0   Max.   :240.00   Max.   :11538   Max.   :3.000  
##                                                                  
##     PERMTEMP        ESTSIZE         FIRMSIZE        SCHOOLN     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :1.000   Median :2.000   Median :3.000   Median :1.000  
##  Mean   :1.275   Mean   :2.094   Mean   :2.935   Mean   :1.117  
##  3rd Qu.:1.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:1.000  
##  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :3.000  
##                                                  NA's   :3002   
##     EFAMTYPE         AGYOWNK     
##  Min.   : 1.000   Min.   :1.00   
##  1st Qu.: 2.000   1st Qu.:1.00   
##  Median : 3.000   Median :2.00   
##  Mean   : 4.899   Mean   :2.21   
##  3rd Qu.: 5.000   3rd Qu.:3.00   
##  Max.   :18.000   Max.   :4.00   
##                   NA's   :65144
  • Missing Values
# SCHOOLN
# Give records with no SCHOOLN value their own code, 4, one past the 1-3 codes
# present in the combined data.
# NOTE(review): presumably a "not applicable" category; confirm against the
# factor labels.
data.all$SCHOOLN <- replace(data.all$SCHOOLN, is.na(data.all$SCHOOLN), 4)
summary(data.all$SCHOOLN)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.199   1.000   4.000
#AGYOWNK
data.all$AGYOWNK[is.na(data.all$AGYOWNK)] <- 5
summary(data.all$AGYOWNK)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   5.000   3.933   5.000   5.000
  • Factor Labels
# Labour force status: code 1 = at work, code 2 = absent from work
unique(data.all$LFSSTAT)
## [1] 1 2
str(data.all$LFSSTAT)
##  int [1:105488] 1 1 1 1 1 1 1 1 1 1 ...
data.all$LFSSTAT <- factor(
  data.all$LFSSTAT,
  levels = c(1, 2),
  labels = c("Employed, at work", "Employed, absent from work")
)
str(data.all$LFSSTAT)
##  Factor w/ 2 levels "Employed, at work",..: 1 1 1 1 1 1 1 1 1 1 ...
# Province: replace the numeric province codes with short abbreviations
data.all$PROV <- factor(
  data.all$PROV,
  levels = c(10, 11, 12, 13, 24, 35, 46, 47, 48, 59),
  labels = c("NL", "PEI", "NS", "NB", "QC", "ON", "MB", "SK", "AB", "BC")
)
str(data.all$PROV)
##  Factor w/ 10 levels "NL","PEI","NS",..: 7 6 6 5 3 9 5 3 9 10 ...
# CMA: codes 1-3 are named cities, code 4 is the residual "Other" group
sort(unique(data.all$CMA))
## [1] 1 2 3 4
str(data.all$CMA)
##  num [1:105488] 4 4 4 4 4 4 4 4 4 3 ...
data.all$CMA <- factor(
  data.all$CMA,
  levels = c(1, 2, 3, 4),
  labels = c("Montreal", "Toronto", "Vancouver", "Other")
)
str(data.all$CMA)
##  Factor w/ 4 levels "Montreal","Toronto",..: 4 4 4 4 4 4 4 4 4 3 ...
# Age in 12 groups, stored as an ordered factor (youngest to oldest)
sort(unique(data.all$AGE_12))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12
str(data.all$AGE_12)
##  int [1:105488] 4 8 6 4 3 9 8 2 2 10 ...
data.all$AGE_12 <- ordered(
  data.all$AGE_12,
  levels = 1:12,
  labels = c(
    "15-19", "20-24", "25-29", "30-34", "35-39", "40-44",
    "45-49", "50-54", "55-59", "60-64", "65-69", "70-over"
  )
)
str(data.all$AGE_12)
##  Ord.factor w/ 12 levels "15-19"<"20-24"<..: 4 8 6 4 3 9 8 2 2 10 ...
# Sex: code 1 = Male, code 2 = Female
sort(unique(data.all$SEX))
## [1] 1 2
str(data.all$SEX)
##  int [1:105488] 1 2 1 2 1 1 1 1 2 2 ...
data.all$SEX <- factor(
  data.all$SEX,
  levels = c(1, 2),
  labels = c("Male", "Female")
)
str(data.all$SEX)
##  Factor w/ 2 levels "Male","Female": 1 2 1 2 1 1 1 1 2 2 ...
# Marital status (six unordered categories)
sort(unique(data.all$MARSTAT))
## [1] 1 2 3 4 5 6
str(data.all$MARSTAT)
##  int [1:105488] 6 1 1 1 2 1 2 6 1 6 ...
data.all$MARSTAT <- factor(
  data.all$MARSTAT,
  levels = 1:6,
  labels = c(
    "Married",
    "Common-law", # living in common-law
    "Widowed",
    "Separated",
    "Divorced",
    "Single, NM"  # single, never married
  )
)
str(data.all$MARSTAT)
##  Factor w/ 6 levels "Married","Common-law",..: 6 1 1 1 2 1 2 6 1 6 ...
# Educational attainment, stored as an ordered factor (lowest to highest)
sort(unique(data.all$EDUC))
## [1] 0 1 2 3 4 5 6
str(data.all$EDUC)
##  int [1:105488] 4 4 2 4 5 1 4 2 4 1 ...
data.all$EDUC <- ordered(
  data.all$EDUC,
  levels = 0:6,
  labels = c(
    "0 to 8 years",
    "Some high school",
    "High school graduate",
    "Some postsecondary",
    "Postsecondary certificate or diploma",
    "Bachelor's degree",
    "Above bachelor's degree"
  )
)
str(data.all$EDUC)
##  Ord.factor w/ 7 levels "0 to 8 years"<..: 5 5 3 5 6 2 5 3 5 2 ...
# Copy EDUC into a new column and rename its levels to short names
# (each list entry reads: short name = existing long level)
data.all$EDUCshort <- data.all$EDUC
levels(data.all$EDUCshort) <- list(
  No.HS     = "0 to 8 years",
  Some.HS   = "Some high school",
  HS.grad   = "High school graduate",
  Some.Post = "Some postsecondary",
  Post.cert = "Postsecondary certificate or diploma",
  Bachelor  = "Bachelor's degree",
  Above.B   = "Above bachelor's degree"
)
str(data.all$EDUCshort)
##  Ord.factor w/ 7 levels "No.HS"<"Some.HS"<..: 5 5 3 5 6 2 5 3 5 2 ...
# Jobholder type: code 1 = single jobholder, code 2 = multiple jobholder
sort(unique(data.all$MJH))
## [1] 1 2
str(data.all$MJH)
##  int [1:105488] 1 1 1 2 1 1 1 1 1 1 ...
data.all$MJH <- factor(
  data.all$MJH,
  levels = c(1, 2),
  labels = c("Single jobholder", "Multiple jobholder")
)

str(data.all$MJH)
##  Factor w/ 2 levels "Single jobholder",..: 1 1 1 2 1 1 1 1 1 1 ...
# Class of worker at the main job: code 1 = public, code 2 = private
sort(unique(data.all$COWMAIN))
## [1] 1 2
str(data.all$COWMAIN)
##  int [1:105488] 2 2 2 2 1 2 2 2 2 2 ...
data.all$COWMAIN <- factor(
  data.all$COWMAIN,
  levels = c(1, 2),
  labels = c("Public sector", "Private sector")
)
str(data.all$COWMAIN)
##  Factor w/ 2 levels "Public sector",..: 2 2 2 2 1 2 2 2 2 2 ...
# Immigrant status (NA values are kept as NA, not recoded)
sort(unique(data.all$IMMIG))
## [1] 1 2 3
str(data.all$IMMIG)
##  int [1:105488] NA NA NA NA NA NA NA NA NA NA ...
data.all$IMMIG <- factor(
  data.all$IMMIG,
  levels = c(1, 2, 3),
  labels = c(
    "Immigrant, landed =< 10 years",
    "Immigrant, landed > 10 years",
    "Non-immigrant"
  )
)
str(data.all$IMMIG)
##  Factor w/ 3 levels "Immigrant, landed =< 10 years",..: NA NA NA NA NA NA NA NA NA NA ...
summary(data.all$IMMIG)
## Immigrant, landed =< 10 years  Immigrant, landed > 10 years 
##                          3236                          5232 
##                 Non-immigrant                          NA's 
##                         42463                         54557
# Industry of the main job (18 unordered categories, abbreviated labels)
sort(unique(data.all$NAICS_18))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18
str(data.all$NAICS_18)
##  num [1:105488] 17 8 5 14 13 6 4 12 14 14 ...
data.all$NAICS_18 <- factor(
  data.all$NAICS_18,
  levels = 1:18,
  labels = c(
    "Agriculture",
    "Forestry, Fishing, Min., Oil & Gas",
    "Utilities",
    "Construction",
    "Manufacturing durables",
    "Manufacturing non-durables",
    "Wholesale Trade",
    "Retail Trade",
    "Transportation & Warehousing",
    "Finance, Insurance, Real Est. & Leas.",
    "Prof., Scientific & Technical Services",
    "Management, Admin. & Support",
    "Educational Services",
    "Health Care & Social Assistance",
    "Information, Culture & Recreation",
    "Accommodation & Food Services",
    "Other Services",
    "Public Administration"
  )
)
str(data.all$NAICS_18)
##  Factor w/ 18 levels "Agriculture",..: 17 8 5 14 13 6 4 12 14 14 ...
# Copy NAICS_18 into a new column and rename its levels to short names
# (each list entry reads: short name = existing long level)
data.all$NAICS_18short <- data.all$NAICS_18
levels(data.all$NAICS_18short) <- list(
  Agri   = "Agriculture",
  # unabbreviated: "Forestry, Fishing, Mining, Oil and Gas"
  Fores  = "Forestry, Fishing, Min., Oil & Gas",
  Utils  = "Utilities",
  Const  = "Construction",
  ManuD  = "Manufacturing durables",
  ManuN  = "Manufacturing non-durables",
  Whole  = "Wholesale Trade",
  Rtail  = "Retail Trade",
  Trans  = "Transportation & Warehousing",
  # unabbreviated: "Finance, Insurance, Real Estate and Leasing"
  Finan  = "Finance, Insurance, Real Est. & Leas.",
  # unabbreviated: "Professional, Scientific and Technical Services"
  ProSc  = "Prof., Scientific & Technical Services",
  # unabbreviated: "Management, Administrative and Other Support"
  Mngt   = "Management, Admin. & Support",
  Educa  = "Educational Services",
  Health = "Health Care & Social Assistance",
  Info   = "Information, Culture & Recreation",
  AcFood = "Accommodation & Food Services",
  Other  = "Other Services",
  PubAd  = "Public Administration"
)
str(data.all$NAICS_18short)
##  Factor w/ 18 levels "Agri","Fores",..: 17 8 5 14 13 6 4 12 14 14 ...
# Occupation at the main job, 10-category grouping (abbreviated labels)
sort(unique(data.all$NOC_10))
##  [1]  1  2  3  4  5  6  7  8  9 10
str(data.all$NOC_10)
##  num [1:105488] 7 2 10 5 5 8 8 2 4 7 ...
data.all$NOC_10 <- factor(
  data.all$NOC_10,
  levels = 1:10,
  labels = c(
    "Management",
    "Business, finance & administration",
    "Natural & applied sciences",
    "Health",
    # unabbreviated: "Education, law and social, community and government services"
    "Educ., law, community & gov. services",
    "Art, culture, recreation & sport",
    "Sales & service",
    # unabbreviated: "Trades, transport and equipment operators"
    "Trades, transport & equipm. operators",
    "Natural resources & agriculture",
    "Manufacturing & utilities"
  )
)
str(data.all$NOC_10)
##  Factor w/ 10 levels "Management","Business, finance & administration",..: 7 2 10 5 5 8 8 2 4 7 ...
summary(data.all$NOC_10)
##                            Management    Business, finance & administration 
##                                  6543                                 17907 
##            Natural & applied sciences                                Health 
##                                  6963                                  8000 
## Educ., law, community & gov. services      Art, culture, recreation & sport 
##                                 11514                                  1813 
##                       Sales & service Trades, transport & equipm. operators 
##                                 27360                                 16821 
##       Natural resources & agriculture             Manufacturing & utilities 
##                                  3153                                  5414
# Copy NOC_10 into a new column and rename its levels to short names
# (each list entry reads: short name = existing long level)
data.all$NOC_10short <- data.all$NOC_10
levels(data.all$NOC_10short) <- list(
  Mngt    = "Management",
  BusFin  = "Business, finance & administration",
  NatASc  = "Natural & applied sciences",
  Health  = "Health",
  EduLaw  = "Educ., law, community & gov. services",
  ArtCul  = "Art, culture, recreation & sport",
  Sales   = "Sales & service",
  Trades  = "Trades, transport & equipm. operators",
  NatAgri = "Natural resources & agriculture",
  ManUtil = "Manufacturing & utilities"
)
str(data.all$NOC_10short)
##  Factor w/ 10 levels "Mngt","BusFin",..: 7 2 10 5 5 8 8 2 4 7 ...
# Occupation at the main job, 40-category grouping (NA values stay NA)
sort(unique(data.all$NOC_40))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
str(data.all$NOC_40)
##  int [1:105488] NA NA NA NA NA NA NA NA NA NA ...
data.all$NOC_40 <- factor(
  data.all$NOC_40,
  levels = 1:40,
  labels = c(
    "Senior management",
    "Specialized middle management",
    "Middle management in retail and wholesale trade and customer services",
    "Middle management in trades, transportation, production and utilities",
    "Professional occupations in business and finance",
    "Administrative and financial supervisors and administrative occupations",
    "Finance, insurance and related business administrative occupations",
    "Office support",
    "Distribution, tracking and scheduling co-ordination",
    "Professional occupations in natural and applied sciences",
    "Technical occupations related to natural and applied sciences",
    "Professional occupations in nursing",
    "Professional occupations in health (except nursing)",
    "Technical occupations in health",
    "Assisting occupations in support of health services",
    "Professional occupations in education services",
    "Professional occupations in law and social, community and government services",
    "Paraprofessional occupations in legal, social, community and education services",
    "Front-line public protection services",
    "Care providers and educational, legal and public protection support",
    "Professional occupations in art and culture",
    "Technical occupations in art, culture, recreation and sport",
    "Retail sales supervisors and specialized sales",
    "Service supervisors and specialized service",
    "Sales representatives and salespersons - wholesale and retail trade",
    "Service representatives and other customer and personal services occupations",
    "Sales support",
    "Service support and other service occupations, n.e.c.",
    "Industrial, electrical and construction trades",
    "Maintenance and equipment operation trades",
    "Other installers, repairers and servicers and material handlers",
    "Transport and heavy equipment operation and related maintenance occupations",
    "Trades helpers, construction labourers and related occupations",
    "Supervisors and technical occupations in natural resources, agriculture and related production",
    "Workers in natural resources, agriculture and related production",
    "Harvesting, landscaping and natural resources labourers",
    "Processing, manufacturing and utilities supervisors and central control operators",
    "Processing and manufacturing machine operators and related production workers",
    "Assemblers in manufacturing",
    "Labourers in processing, manufacturing and utilities"
  )
)
str(data.all$NOC_40)
##  Factor w/ 40 levels "Senior management",..: NA NA NA NA NA NA NA NA NA NA ...
summary(data.all$NOC_40)
##                                                                              Senior management 
##                                                                                            138 
##                                                                  Specialized middle management 
##                                                                                           1490 
##                          Middle management in retail and wholesale trade and customer services 
##                                                                                            626 
##                          Middle management in trades, transportation, production and utilities 
##                                                                                            702 
##                                               Professional occupations in business and finance 
##                                                                                           1576 
##                        Administrative and financial supervisors and administrative occupations 
##                                                                                           3078 
##                             Finance, insurance and related business administrative occupations 
##                                                                                            621 
##                                                                                 Office support 
##                                                                                           1952 
##                                            Distribution, tracking and scheduling co-ordination 
##                                                                                            858 
##                                       Professional occupations in natural and applied sciences 
##                                                                                           1827 
##                                  Technical occupations related to natural and applied sciences 
##                                                                                           1756 
##                                                            Professional occupations in nursing 
##                                                                                           1115 
##                                            Professional occupations in health (except nursing) 
##                                                                                            499 
##                                                                Technical occupations in health 
##                                                                                           1230 
##                                            Assisting occupations in support of health services 
##                                                                                           1233 
##                                                 Professional occupations in education services 
##                                                                                           2479 
##                  Professional occupations in law and social, community and government services 
##                                                                                           1377 
##                Paraprofessional occupations in legal, social, community and education services 
##                                                                                           1245 
##                                                          Front-line public protection services 
##                                                                                            355 
##                            Care providers and educational, legal and public protection support 
##                                                                                            915 
##                                                    Professional occupations in art and culture 
##                                                                                            229 
##                                    Technical occupations in art, culture, recreation and sport 
##                                                                                            632 
##                                                 Retail sales supervisors and specialized sales 
##                                                                                           1548 
##                                                    Service supervisors and specialized service 
##                                                                                           1855 
##                            Sales representatives and salespersons - wholesale and retail trade 
##                                                                                           2199 
##                   Service representatives and other customer and personal services occupations 
##                                                                                           2354 
##                                                                                  Sales support 
##                                                                                           1926 
##                                          Service support and other service occupations, n.e.c. 
##                                                                                           2900 
##                                                 Industrial, electrical and construction trades 
##                                                                                           2557 
##                                                     Maintenance and equipment operation trades 
##                                                                                           1883 
##                                Other installers, repairers and servicers and material handlers 
##                                                                                            802 
##                    Transport and heavy equipment operation and related maintenance occupations 
##                                                                                           2205 
##                                 Trades helpers, construction labourers and related occupations 
##                                                                                            608 
## Supervisors and technical occupations in natural resources, agriculture and related production 
##                                                                                            583 
##                               Workers in natural resources, agriculture and related production 
##                                                                                            616 
##                                        Harvesting, landscaping and natural resources labourers 
##                                                                                            373 
##              Processing, manufacturing and utilities supervisors and central control operators 
##                                                                                            638 
##                  Processing and manufacturing machine operators and related production workers 
##                                                                                            949 
##                                                                    Assemblers in manufacturing 
##                                                                                            515 
##                                           Labourers in processing, manufacturing and utilities 
##                                                                                            487 
##                                                                                           NA's 
##                                                                                          54557
# Full-time / part-time status at the main job: code 1 = full, code 2 = part
sort(unique(data.all$FTPTMAIN))
## [1] 1 2
str(data.all$FTPTMAIN)
##  int [1:105488] 1 1 1 2 2 1 1 1 1 1 ...
data.all$FTPTMAIN <- factor(
  data.all$FTPTMAIN,
  levels = c(1, 2),
  labels = c("Full-time", "Part-time")
)
str(data.all$FTPTMAIN)
##  Factor w/ 2 levels "Full-time","Part-time": 1 1 1 2 2 1 1 1 1 1 ...
summary(data.all$FTPTMAIN)
## Full-time Part-time 
##     86157     19331
# Union status (three unordered categories)
sort(unique(data.all$UNION))
## [1] 1 2 3
str(data.all$UNION)
##  int [1:105488] 3 3 1 3 1 1 1 3 3 1 ...
data.all$UNION <- factor(
  data.all$UNION,
  levels = 1:3,
  labels = c(
    "Union member",
    "Not member but covered", # not a member but covered by a union contract
    "Non-unionized"
  )
)
str(data.all$UNION)
##  Factor w/ 3 levels "Union member",..: 3 3 1 3 1 1 1 3 3 1 ...
summary(data.all$UNION)
##           Union member Not member but covered          Non-unionized 
##                  33074                   2116                  70298
# Job permanency: permanent vs. three kinds of temporary job
sort(unique(data.all$PERMTEMP))
## [1] 1 2 3 4
str(data.all$PERMTEMP)
##  int [1:105488] 1 1 1 1 4 1 1 1 1 1 ...
data.all$PERMTEMP <- factor(
  data.all$PERMTEMP,
  levels = 1:4,
  labels = c(
    "Permanent",
    "Temp. season",   # temporary, seasonal
    "Temp. contract", # temporary, term or contract
    "Temp. casual"    # temporary, casual or other
  )
)
str(data.all$PERMTEMP)
##  Factor w/ 4 levels "Permanent","Temp. season",..: 1 1 1 1 4 1 1 1 1 1 ...
summary(data.all$PERMTEMP)
##      Permanent   Temp. season Temp. contract   Temp. casual 
##          90764           4305           6583           3836
# Establishment size (number of employees), ordered smallest to largest
sort(unique(data.all$ESTSIZE))
## [1] 1 2 3 4
str(data.all$ESTSIZE)
##  int [1:105488] 2 2 4 1 4 4 4 3 1 1 ...
data.all$ESTSIZE <- ordered(
  data.all$ESTSIZE,
  levels = 1:4,
  labels = c("<20", "20-99", "100-500", ">500")
)
str(data.all$ESTSIZE)
##  Ord.factor w/ 4 levels "<20"<"20-99"<..: 2 2 4 1 4 4 4 3 1 1 ...
summary(data.all$ESTSIZE)
##     <20   20-99 100-500    >500 
##   36269   35799   20604   12816
# Firm size (number of employees), ordered smallest to largest
sort(unique(data.all$FIRMSIZE))
## [1] 1 2 3 4
str(data.all$FIRMSIZE)
##  int [1:105488] 2 2 4 1 4 4 4 4 1 1 ...
data.all$FIRMSIZE <- ordered(
  data.all$FIRMSIZE,
  levels = 1:4,
  labels = c("<20", "20-99", "100-500", ">500")
)
str(data.all$FIRMSIZE)
##  Ord.factor w/ 4 levels "<20"<"20-99"<..: 2 2 4 1 4 4 4 4 1 1 ...
summary(data.all$FIRMSIZE)
##     <20   20-99 100-500    >500 
##   20662   17463   15467   51896
# Current student status; code 4 is the sentinel that replaced NA earlier
sort(unique(data.all$SCHOOLN))
## [1] 1 2 3 4
str(data.all$SCHOOLN)
##  num [1:105488] 1 1 1 1 1 1 1 1 1 1 ...
data.all$SCHOOLN <- factor(
  data.all$SCHOOLN,
  levels = 1:4,
  labels = c(
    "Non-student",
    "Full-time student",
    "Part-time student",
    "Unknown"
  )
)
str(data.all$SCHOOLN)
##  Factor w/ 4 levels "Non-student",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(data.all$SCHOOLN)
##       Non-student Full-time student Part-time student           Unknown 
##             92872              7260              2354              3002
# Type of Economic Family (18 unordered categories, coded labels)
# Legend for the short labels used below:
#   Ind    - Unattached individual
#   HWDENC - Husband-wife, dual earner couple, no children or none under 25
#   HWDE17 - Husband-wife, dual earner couple, youngest child 0 to 17
#   HWDE24 - Husband-wife, dual earner couple, youngest child 18 to 24
#   HWSHNC - Husband-wife, single earner couple, husband employed, no children or none under 25
#   HWSH17 - Husband-wife, single earner couple, husband employed, youngest child 0 to 17
#   HWSH24 - Husband-wife, single earner couple, husband employed, youngest child 18 to 24
#   HWSWNC - Husband-wife, single earner couple, wife employed, no children or none under 25
#   HWSW17 - Husband-wife, single earner couple, wife employed, youngest child 0 to 17
#   HWSW24 - Husband-wife, single earner couple, wife employed, youngest child 18 to 24
#   HWNENC - Husband-wife, non-earner couple, no children or none under 25
#   HWNE17 - Husband-wife, non-earner couple, youngest child 0 to 17
#   HWNE24 - Husband-wife, non-earner couple, youngest child 18 to 24
#   SPE17  - Single-parent family, parent employed, youngest child 0 to 17
#   SPE24  - Single-parent family, parent employed, youngest child 18 to 24
#   SPN17  - Single-parent family, parent not employed, youngest child 0 to 17
#   SPN24  - Single-parent family, parent not employed, youngest child 18 to 24
#   Other  - Other families
sort(unique(data.all$EFAMTYPE))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18
str(data.all$EFAMTYPE)
##  int [1:105488] 1 2 3 2 2 5 5 1 2 1 ...
data.all$EFAMTYPE <- factor(
  data.all$EFAMTYPE,
  levels = 1:18,
  labels = c(
    "Ind",
    "HWDENC", "HWDE17", "HWDE24",
    "HWSHNC", "HWSH17", "HWSH24",
    "HWSWNC", "HWSW17", "HWSW24",
    "HWNENC", "HWNE17", "HWNE24",
    "SPE17", "SPE24",
    "SPN17", "SPN24",
    "Other"
  )
)
str(data.all$EFAMTYPE)
##  Factor w/ 18 levels "Ind","HWDENC",..: 1 2 3 2 2 5 5 1 2 1 ...
summary(data.all$EFAMTYPE)
##    Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC 
##  16842  22136  28911   7892   4053   4009    986   3423   1355    712    749 
## HWNE17 HWNE24  SPE17  SPE24  SPN17  SPN24  Other 
##    109    181   4991   2024    267    189   6659
# Age of Youngest Child (Years)
sort(unique(data.all$AGYOWNK))
## [1] 1 2 3 4 5
str(data.all$AGYOWNK)
##  num [1:105488] 5 5 2 5 5 5 5 5 5 5 ...
# Recode AGYOWNK (age of youngest child) from codes 1-5 into an ordered factor.
# Code 1 is the youngest band: it sorts below "6-12", so it is labelled "<6"
# (it was previously mislabelled ">6").
# NOTE(review): any later code that matches the literal label ">6" must be
# switched to "<6" -- confirm against the rest of the script.
data.all$AGYOWNK <- factor(data.all$AGYOWNK,
                            levels = c(1:5),
                            labels = c("<6"
                                       , "6-12"
                                       , "13-17"
                                       , "18-24"
                                       , ">24|NC"), # |NC means OR No Children
                            ordered = TRUE)
str(data.all$AGYOWNK)
##  Ord.factor w/ 5 levels ">6"<"6-12"<"13-17"<..: 5 5 2 5 5 5 5 5 5 5 ...
summary(data.all$AGYOWNK)
##     >6   6-12  13-17  18-24 >24|NC 
##  13510  11724   8270   6840  65144
str(data.all)
## 'data.frame':    105488 obs. of  30 variables:
##  $ REC_NUM      : int  4 5 10 11 12 15 16 18 19 21 ...
##  $ SURVYEAR     : int  2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
##  $ SURVMNTH     : int  9 9 9 9 9 9 9 9 9 9 ...
##  $ LFSSTAT      : Factor w/ 2 levels "Employed, at work",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ PROV         : Factor w/ 10 levels "NL","PEI","NS",..: 7 6 6 5 3 9 5 3 9 10 ...
##  $ CMA          : Factor w/ 4 levels "Montreal","Toronto",..: 4 4 4 4 4 4 4 4 4 3 ...
##  $ AGE_12       : Ord.factor w/ 12 levels "15-19"<"20-24"<..: 4 8 6 4 3 9 8 2 2 10 ...
##  $ SEX          : Factor w/ 2 levels "Male","Female": 1 2 1 2 1 1 1 1 2 2 ...
##  $ MARSTAT      : Factor w/ 6 levels "Married","Common-law",..: 6 1 1 1 2 1 2 6 1 6 ...
##  $ EDUC         : Ord.factor w/ 7 levels "0 to 8 years"<..: 5 5 3 5 6 2 5 3 5 2 ...
##  $ MJH          : Factor w/ 2 levels "Single jobholder",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ COWMAIN      : Factor w/ 2 levels "Public sector",..: 2 2 2 2 1 2 2 2 2 2 ...
##  $ IMMIG        : Factor w/ 3 levels "Immigrant, landed =< 10 years",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ NAICS_18     : Factor w/ 18 levels "Agriculture",..: 17 8 5 14 13 6 4 12 14 14 ...
##  $ NOC_10       : Factor w/ 10 levels "Management","Business, finance & administration",..: 7 2 10 5 5 8 8 2 4 7 ...
##  $ NOC_40       : Factor w/ 40 levels "Senior management",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ FTPTMAIN     : Factor w/ 2 levels "Full-time","Part-time": 1 1 1 2 2 1 1 1 1 1 ...
##  $ UTOTHRS      : int  450 375 400 496 240 400 370 375 400 450 ...
##  $ TENURE       : int  105 14 115 36 7 237 39 11 29 39 ...
##  $ HRLYEARN     : int  2564 1949 3750 1690 667 2550 2600 1380 2100 1700 ...
##  $ UNION        : Factor w/ 3 levels "Union member",..: 3 3 1 3 1 1 1 3 3 1 ...
##  $ PERMTEMP     : Factor w/ 4 levels "Permanent","Temp. season",..: 1 1 1 1 4 1 1 1 1 1 ...
##  $ ESTSIZE      : Ord.factor w/ 4 levels "<20"<"20-99"<..: 2 2 4 1 4 4 4 3 1 1 ...
##  $ FIRMSIZE     : Ord.factor w/ 4 levels "<20"<"20-99"<..: 2 2 4 1 4 4 4 4 1 1 ...
##  $ SCHOOLN      : Factor w/ 4 levels "Non-student",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ EFAMTYPE     : Factor w/ 18 levels "Ind","HWDENC",..: 1 2 3 2 2 5 5 1 2 1 ...
##  $ AGYOWNK      : Ord.factor w/ 5 levels ">6"<"6-12"<"13-17"<..: 5 5 2 5 5 5 5 5 5 5 ...
##  $ EDUCshort    : Ord.factor w/ 7 levels "No.HS"<"Some.HS"<..: 5 5 3 5 6 2 5 3 5 2 ...
##  $ NAICS_18short: Factor w/ 18 levels "Agri","Fores",..: 17 8 5 14 13 6 4 12 14 14 ...
##  $ NOC_10short  : Factor w/ 10 levels "Mngt","BusFin",..: 7 2 10 5 5 8 8 2 4 7 ...
  • Numeric attributes value adjustment and stats summary
# NUMERIC VARIABLES REAL VALUES
# Rescale the integer-coded columns in place: UTOTHRS is divided by 10 and
# HRLYEARN by 100. These assignments are not idempotent -- re-running this
# chunk on an already rescaled data.all divides the values a second time.
data.all[["UTOTHRS"]] <- data.all[["UTOTHRS"]] / 10
data.all[["HRLYEARN"]] <- data.all[["HRLYEARN"]] / 100

# NUMERIC VARIABLES SUMMARY
# Names of the three numeric columns, reused by the summaries further down.
num.vars <- c("UTOTHRS", "TENURE", "HRLYEARN")
summary(data.all[data.all$SURVYEAR == 2009, num.vars])
##     UTOTHRS          TENURE         HRLYEARN     
##  Min.   : 0.40   Min.   :  1.0   Min.   :  2.00  
##  1st Qu.:35.00   1st Qu.: 15.0   1st Qu.: 13.00  
##  Median :40.00   Median : 51.0   Median : 19.00  
##  Mean   :36.13   Mean   : 85.3   Mean   : 21.47  
##  3rd Qu.:40.00   3rd Qu.:139.0   3rd Qu.: 27.24  
##  Max.   :99.00   Max.   :240.0   Max.   :115.38
summary(data.all[data.all$SURVYEAR == 2019, num.vars])
##     UTOTHRS          TENURE          HRLYEARN     
##  Min.   : 1.00   Min.   :  1.00   Min.   :  3.00  
##  1st Qu.:35.00   1st Qu.: 15.00   1st Qu.: 17.00  
##  Median :40.00   Median : 55.00   Median : 24.00  
##  Mean   :36.34   Mean   : 86.79   Mean   : 27.48  
##  3rd Qu.:40.00   3rd Qu.:146.00   3rd Qu.: 34.97  
##  Max.   :99.00   Max.   :240.00   Max.   :107.96
sd(data.all$UTOTHRS[data.all$SURVYEAR == 2009])
## [1] 11.23495
sd(data.all$UTOTHRS[data.all$SURVYEAR == 2019])
## [1] 11.16521
sd(data.all$TENURE[data.all$SURVYEAR == 2009])
## [1] 83.43826
sd(data.all$TENURE[data.all$SURVYEAR == 2019])
## [1] 82.60483
sd(data.all$HRLYEARN[data.all$SURVYEAR == 2009])
## [1] 11.27651
sd(data.all$HRLYEARN[data.all$SURVYEAR == 2019])
## [1] 13.66143

STEP 2: Variable Analysis by Groups

# DESCRIPTIVE STATISTICS BY YEAR AND GENDER
summary(data.all[data.all$SURVYEAR == 2009 & data.all$SEX == "Male", num.vars])
##     UTOTHRS          TENURE          HRLYEARN     
##  Min.   : 0.40   Min.   :  1.00   Min.   :  2.14  
##  1st Qu.:37.50   1st Qu.: 15.00   1st Qu.: 14.50  
##  Median :40.00   Median : 51.00   Median : 20.51  
##  Mean   :39.01   Mean   : 86.29   Mean   : 23.25  
##  3rd Qu.:40.00   3rd Qu.:144.00   3rd Qu.: 29.80  
##  Max.   :99.00   Max.   :240.00   Max.   :115.38
summary(data.all[data.all$SURVYEAR == 2019 & data.all$SEX == "Male", num.vars])
##     UTOTHRS          TENURE          HRLYEARN     
##  Min.   : 1.00   Min.   :  1.00   Min.   :  3.00  
##  1st Qu.:37.50   1st Qu.: 15.00   1st Qu.: 18.00  
##  Median :40.00   Median : 53.00   Median : 25.65  
##  Mean   :38.76   Mean   : 85.67   Mean   : 29.29  
##  3rd Qu.:40.00   3rd Qu.:144.00   3rd Qu.: 37.00  
##  Max.   :99.00   Max.   :240.00   Max.   :107.96
summary(data.all[data.all$SURVYEAR == 2009 & data.all$SEX == "Female", num.vars])
##     UTOTHRS          TENURE          HRLYEARN    
##  Min.   : 0.40   Min.   :  1.00   Min.   : 2.00  
##  1st Qu.:30.00   1st Qu.: 16.00   1st Qu.:11.75  
##  Median :37.00   Median : 50.00   Median :17.14  
##  Mean   :33.33   Mean   : 84.35   Mean   :19.74  
##  3rd Qu.:40.00   3rd Qu.:135.00   3rd Qu.:24.91  
##  Max.   :99.00   Max.   :240.00   Max.   :89.74
summary(data.all[data.all$SURVYEAR == 2019 & data.all$SEX == "Female", num.vars])
##     UTOTHRS          TENURE          HRLYEARN     
##  Min.   : 1.00   Min.   :  1.00   Min.   :  3.07  
##  1st Qu.:30.00   1st Qu.: 16.00   1st Qu.: 16.00  
##  Median :37.50   Median : 57.00   Median : 22.00  
##  Mean   :33.91   Mean   : 87.91   Mean   : 25.66  
##  3rd Qu.:40.00   3rd Qu.:148.00   3rd Qu.: 31.79  
##  Max.   :99.00   Max.   :240.00   Max.   :106.67
# Standard deviation of each numeric variable within every year/sex subset.
# vapply() walks the data-frame columns directly and returns a named numeric
# vector, one entry per column in num.vars.
vapply(data.all[with(data.all, SURVYEAR == 2009 & SEX == "Male"), num.vars],
       sd, numeric(1))
##  UTOTHRS   TENURE HRLYEARN 
## 10.94336 84.67368 11.94321
vapply(data.all[with(data.all, SURVYEAR == 2009 & SEX == "Female"), num.vars],
       sd, numeric(1))
##  UTOTHRS   TENURE HRLYEARN 
## 10.79760 82.20528 10.29534
vapply(data.all[with(data.all, SURVYEAR == 2019 & SEX == "Male"), num.vars],
       sd, numeric(1))
##  UTOTHRS   TENURE HRLYEARN 
## 11.04963 82.52859 14.45311
vapply(data.all[with(data.all, SURVYEAR == 2019 & SEX == "Female"), num.vars],
       sd, numeric(1))
##  UTOTHRS   TENURE HRLYEARN 
## 10.74592 82.66763 12.55978
# OUTLIERS IN NUMERIC VARIABLES
# Draw one box per SURVYEAR x SEX combination and keep the list that boxplot()
# returns, so the numbers behind the plot can be inspected afterwards.
hours <- boxplot(UTOTHRS ~ SURVYEAR + SEX, data = data.all, main = "Usual hours worked")

# $stats holds, per group, the five values drawn for each box (lower whisker,
# lower hinge, median, upper hinge, upper whisker); $n is the group size and
# $names the group labels. The table(hours$group) call below then counts the
# points drawn beyond the whiskers in each group.
hours[c("stats", "n", "names")]
## $stats
##      [,1] [,2] [,3] [,4]
## [1,] 33.8 33.8   15 15.0
## [2,] 37.5 37.5   30 30.0
## [3,] 40.0 40.0   37 37.5
## [4,] 40.0 40.0   40 40.0
## [5,] 43.5 43.7   55 55.0
## 
## $n
## [1] 26942 25499 27615 25432
## 
## $names
## [1] "2009.Male"   "2019.Male"   "2009.Female" "2019.Female"
table(hours$group)
## 
##    1    2    3    4 
## 8453 8080 2324 2096
tenure <- boxplot(TENURE ~ SURVYEAR + SEX, data = data.all, main = "Job tenure with current employer")

tenure[c("stats","n")]
## $stats
##      [,1] [,2] [,3] [,4]
## [1,]    1    1    1    1
## [2,]   15   15   16   16
## [3,]   51   53   50   57
## [4,]  144  144  135  148
## [5,]  240  240  240  240
## attr(,"class")
## 2009.Male 
## "integer" 
## 
## $n
## [1] 26942 25499 27615 25432
table(tenure$group)
## < table of extent 0 >
wage <- boxplot(HRLYEARN ~ SURVYEAR + SEX, data = data.all, main = "Usual hourly wages")

wage[c("stats","n")]
## $stats
##       [,1]  [,2]  [,3]  [,4]
## [1,]  2.14  3.00  2.00  3.07
## [2,] 14.50 18.00 11.75 16.00
## [3,] 20.51 25.65 17.14 22.00
## [4,] 29.80 37.00 24.91 31.79
## [5,] 52.69 65.38 44.62 55.38
## 
## $n
## [1] 26942 25499 27615 25432
table(wage$group)
## 
##   1   2   3   4 
## 675 618 708 681
table(data.all$SURVYEAR, data.all$SEX)
##       
##         Male Female
##   2009 26942  27615
##   2019 25499  25432
# Split data.all into the four year-by-sex subsets used by every plot and test
# below (all columns kept).
data.all.09male <- data.all[with(data.all, SURVYEAR == 2009 & SEX == "Male"), ]
data.all.19male <- data.all[with(data.all, SURVYEAR == 2019 & SEX == "Male"), ]
data.all.09fem <- data.all[with(data.all, SURVYEAR == 2009 & SEX == "Female"), ]
data.all.19fem <- data.all[with(data.all, SURVYEAR == 2019 & SEX == "Female"), ]
# HISTOGRAMS ###################################################################
# par(mfrow=c(2, 2))
# Shrink the top and right margins: the histograms are drawn without a title.
par(mar = c(4.1, 4.1, 1.1, 1.1))

# Usual hours worked
# One density-scaled histogram (freq = FALSE) per year/sex subset, overlaid
# with the normal density that uses the subset's own mean and sd. The four
# panels share axis limits so they can be compared directly.

# Males 2009
hist(data.all.09male$UTOTHRS,
     breaks = 50, freq = FALSE,
     xlim = c(0, 100), ylim = c(0, 0.25),
     cex.lab = 1.5, cex.axis = 1.5,
     col = "slategray2", main = "",
     xlab = "Usual hours worked")
curve(dnorm(x,
            mean = mean(data.all.09male$UTOTHRS),
            sd = sd(data.all.09male$UTOTHRS)),
      add = TRUE, col = "royalblue4", lwd = 2)

# Males 2019
hist(data.all.19male$UTOTHRS,
     breaks = 50, freq = FALSE,
     xlim = c(0, 100), ylim = c(0, 0.25),
     cex.lab = 1.5, cex.axis = 1.5,
     col = "slategray2", main = "",
     xlab = "Usual hours worked")
curve(dnorm(x,
            mean = mean(data.all.19male$UTOTHRS),
            sd = sd(data.all.19male$UTOTHRS)),
      add = TRUE, col = "royalblue4", lwd = 2)

# Females 2009
hist(data.all.09fem$UTOTHRS,
     breaks = 50, freq = FALSE,
     xlim = c(0, 100), ylim = c(0, 0.25),
     cex.lab = 1.5, cex.axis = 1.5,
     col = "moccasin", main = "",
     xlab = "Usual hours worked")
curve(dnorm(x,
            mean = mean(data.all.09fem$UTOTHRS),
            sd = sd(data.all.09fem$UTOTHRS)),
      add = TRUE, col = "darkorange2", lwd = 2)

# Females 2019
hist(data.all.19fem$UTOTHRS,
     breaks = 50, freq = FALSE,
     xlim = c(0, 100), ylim = c(0, 0.25),
     cex.lab = 1.5, cex.axis = 1.5,
     col = "moccasin", main = "",
     xlab = "Usual hours worked")
curve(dnorm(x,
            mean = mean(data.all.19fem$UTOTHRS),
            sd = sd(data.all.19fem$UTOTHRS)),
      add = TRUE, col = "darkorange2", lwd = 2)

# Tenure
# Same layout as the other histogram groups: density-scaled histogram per
# year/sex subset with the matching normal density drawn on top, and axis
# limits shared across the four panels.

# Males 2009
hist(data.all.09male$TENURE,
     breaks = 50, freq = FALSE,
     xlim = c(0, 240), ylim = c(0, 0.03),
     cex.lab = 1.5, cex.axis = 1.5,
     col = "slategray2", main = "",
     xlab = "Tenure with current employer in months")
curve(dnorm(x,
            mean = mean(data.all.09male$TENURE),
            sd = sd(data.all.09male$TENURE)),
      add = TRUE, col = "royalblue4", lwd = 2)

# Males 2019
hist(data.all.19male$TENURE,
     breaks = 50, freq = FALSE,
     xlim = c(0, 240), ylim = c(0, 0.03),
     cex.lab = 1.5, cex.axis = 1.5,
     col = "slategray2", main = "",
     xlab = "Tenure with current employer in months")
curve(dnorm(x,
            mean = mean(data.all.19male$TENURE),
            sd = sd(data.all.19male$TENURE)),
      add = TRUE, col = "royalblue4", lwd = 2)

# Females 2009
hist(data.all.09fem$TENURE,
     breaks = 50, freq = FALSE,
     xlim = c(0, 240), ylim = c(0, 0.03),
     cex.lab = 1.5, cex.axis = 1.5,
     col = "moccasin", main = "",
     xlab = "Tenure with current employer in months")
curve(dnorm(x,
            mean = mean(data.all.09fem$TENURE),
            sd = sd(data.all.09fem$TENURE)),
      add = TRUE, col = "darkorange2", lwd = 2)

# Females 2019
hist(data.all.19fem$TENURE,
     breaks = 50, freq = FALSE,
     xlim = c(0, 240), ylim = c(0, 0.03),
     cex.lab = 1.5, cex.axis = 1.5,
     col = "moccasin", main = "",
     xlab = "Tenure with current employer in months")
curve(dnorm(x,
            mean = mean(data.all.19fem$TENURE),
            sd = sd(data.all.19fem$TENURE)),
      add = TRUE, col = "darkorange2", lwd = 2)

# Wages
# Density-scaled histogram of hourly wages per year/sex subset, each overlaid
# with the normal density that uses the subset's own mean and sd. All four
# panels use the same axis limits and the same label/axis magnification so
# they can be compared side by side.

# Males 2009
hist(data.all.09male$HRLYEARN
     , xlim = c(0, 120)
     , ylim = c(0, 0.085)
     , breaks = 50
     , freq   = FALSE
     , cex.lab = 1.5
     , cex.axis = 1.5
     , col    = "slategray2"
     , main   = ""
     , xlab   = "Usual hourly wages")
curve(dnorm(x, mean = mean(data.all.09male$HRLYEARN)
            , sd = sd(data.all.09male$HRLYEARN))
      , col = "royalblue4"
      , lwd = 2
      , add = TRUE)

# Males 2019
hist(data.all.19male$HRLYEARN
     , xlim = c(0, 120)
     , ylim = c(0, 0.085)
     , breaks = 50
     , freq   = FALSE
     , cex.lab = 1.5
     , cex.axis = 1.5
     , col    = "slategray2"
     , main   = ""
     , xlab   = "Usual hourly wages")
curve(dnorm(x, mean = mean(data.all.19male$HRLYEARN)
            , sd = sd(data.all.19male$HRLYEARN))
      , col = "royalblue4"
      , lwd = 2
      , add = TRUE)

# Females 2009
# cex.lab / cex.axis were missing from this panel only, so its labels were
# drawn smaller than in the other eleven histograms; they are set here to match.
hist(data.all.09fem$HRLYEARN
     , xlim = c(0, 120)
     , ylim = c(0, 0.085)
     , breaks = 50
     , freq   = FALSE
     , cex.lab = 1.5
     , cex.axis = 1.5
     , col    = "moccasin"
     , main   = ""
     , xlab   = "Usual hourly wages")
curve(dnorm(x, mean = mean(data.all.09fem$HRLYEARN)
            , sd = sd(data.all.09fem$HRLYEARN))
      , col = "darkorange2"
      , lwd = 2
      , add = TRUE)

# Females 2019
hist(data.all.19fem$HRLYEARN
     , xlim = c(0, 120)
     , ylim = c(0, 0.085)
     , breaks = 50
     , freq   = FALSE
     , cex.lab = 1.5
     , cex.axis = 1.5
     , col    = "moccasin"
     , main   = ""
     , xlab   = "Usual hourly wages")
curve(dnorm(x, mean = mean(data.all.19fem$HRLYEARN)
            , sd = sd(data.all.19fem$HRLYEARN))
      , col = "darkorange2"
      , lwd = 2
      , add = TRUE)

# QQ PLOTS #####################################################################
# Normal Q-Q plot of each numeric variable for every year/sex subset.
# The "Females 2019" panels previously plotted data.all.19male (a copy-paste
# slip that duplicated the "Males 2019" plot); they now use data.all.19fem so
# that each title matches the data shown.

# Usual hours worked
ggqqplot(data.all.09male$UTOTHRS, title = "Males 2009")

ggqqplot(data.all.19male$UTOTHRS, title = "Males 2019")

ggqqplot(data.all.09fem$UTOTHRS, title = "Females 2009")

ggqqplot(data.all.19fem$UTOTHRS, title = "Females 2019")

# Tenure
ggqqplot(data.all.09male$TENURE, title = "Males 2009")

ggqqplot(data.all.19male$TENURE, title = "Males 2019")

ggqqplot(data.all.09fem$TENURE, title = "Females 2009")

ggqqplot(data.all.19fem$TENURE, title = "Females 2019")

# Wages
ggqqplot(data.all.09male$HRLYEARN, title = "Males 2009")

ggqqplot(data.all.19male$HRLYEARN, title = "Males 2019")

ggqqplot(data.all.09fem$HRLYEARN, title = "Females 2009")

ggqqplot(data.all.19fem$HRLYEARN, title = "Females 2019")

# KOLMOGOROV-SMIRNOV ###########################################################
# One-sample K-S test of each numeric variable, per year/sex subset, against a
# normal distribution whose mean and sd are taken from that same subset.
# NOTE(review): the reference parameters are estimated from the data being
# tested, and every call below warns about ties; the reported p-values should
# therefore be read as approximate -- confirm against the ks.test documentation.
# Usual Hours Worked
ks.test(data.all.09male$UTOTHRS, "pnorm", mean = mean(data.all.09male$UTOTHRS), sd = sd(data.all.09male$UTOTHRS))
## Warning in ks.test(data.all.09male$UTOTHRS, "pnorm", mean = mean(data.all.
## 09male$UTOTHRS), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.09male$UTOTHRS
## D = 0.266, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19male$UTOTHRS, "pnorm", mean = mean(data.all.19male$UTOTHRS), sd = sd(data.all.19male$UTOTHRS))
## Warning in ks.test(data.all.19male$UTOTHRS, "pnorm", mean = mean(data.all.
## 19male$UTOTHRS), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.19male$UTOTHRS
## D = 0.26344, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.09fem$UTOTHRS, "pnorm", mean = mean(data.all.09fem$UTOTHRS), sd = sd(data.all.09fem$UTOTHRS))
## Warning in ks.test(data.all.09fem$UTOTHRS, "pnorm", mean = mean(data.all.
## 09fem$UTOTHRS), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.09fem$UTOTHRS
## D = 0.21438, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19fem$UTOTHRS, "pnorm", mean = mean(data.all.19fem$UTOTHRS), sd = sd(data.all.19fem$UTOTHRS))
## Warning in ks.test(data.all.19fem$UTOTHRS, "pnorm", mean = mean(data.all.
## 19fem$UTOTHRS), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.19fem$UTOTHRS
## D = 0.21143, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Tenure
ks.test(data.all.09male$TENURE, "pnorm", mean = mean(data.all.09male$TENURE), sd = sd(data.all.09male$TENURE))
## Warning in ks.test(data.all.09male$TENURE, "pnorm", mean = mean(data.all.
## 09male$TENURE), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.09male$TENURE
## D = 0.16707, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19male$TENURE, "pnorm", mean = mean(data.all.19male$TENURE), sd = sd(data.all.19male$TENURE))
## Warning in ks.test(data.all.19male$TENURE, "pnorm", mean = mean(data.all.
## 19male$TENURE), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.19male$TENURE
## D = 0.15484, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.09fem$TENURE, "pnorm", mean = mean(data.all.09fem$TENURE), sd = sd(data.all.09fem$TENURE))
## Warning in ks.test(data.all.09fem$TENURE, "pnorm", mean = mean(data.all.
## 09fem$TENURE), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.09fem$TENURE
## D = 0.16497, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19fem$TENURE, "pnorm", mean = mean(data.all.19fem$TENURE), sd = sd(data.all.19fem$TENURE))
## Warning in ks.test(data.all.19fem$TENURE, "pnorm", mean = mean(data.all.
## 19fem$TENURE), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.19fem$TENURE
## D = 0.1523, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Wage
ks.test(data.all.09male$HRLYEARN, "pnorm", mean = mean(data.all.09male$HRLYEARN), sd = sd(data.all.09male$HRLYEARN))
## Warning in ks.test(data.all.09male$HRLYEARN, "pnorm", mean = mean(data.all.
## 09male$HRLYEARN), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.09male$HRLYEARN
## D = 0.098075, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19male$HRLYEARN, "pnorm", mean = mean(data.all.19male$HRLYEARN), sd = sd(data.all.19male$HRLYEARN))
## Warning in ks.test(data.all.19male$HRLYEARN, "pnorm", mean = mean(data.all.
## 19male$HRLYEARN), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.19male$HRLYEARN
## D = 0.10788, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.09fem$HRLYEARN, "pnorm", mean = mean(data.all.09fem$HRLYEARN), sd = sd(data.all.09fem$HRLYEARN))
## Warning in ks.test(data.all.09fem$HRLYEARN, "pnorm", mean = mean(data.all.
## 09fem$HRLYEARN), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.09fem$HRLYEARN
## D = 0.11895, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19fem$HRLYEARN, "pnorm", mean = mean(data.all.19fem$HRLYEARN), sd = sd(data.all.19fem$HRLYEARN))
## Warning in ks.test(data.all.19fem$HRLYEARN, "pnorm", mean = mean(data.all.
## 19fem$HRLYEARN), : ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  data.all.19fem$HRLYEARN
## D = 0.12557, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Pearson (Only numeric)
round(cor(data.all.09male[num.vars]), 2)
##          UTOTHRS TENURE HRLYEARN
## UTOTHRS     1.00   0.11     0.14
## TENURE      0.11   1.00     0.30
## HRLYEARN    0.14   0.30     1.00
round(cor(data.all.19male[num.vars]), 2)
##          UTOTHRS TENURE HRLYEARN
## UTOTHRS     1.00   0.12     0.17
## TENURE      0.12   1.00     0.30
## HRLYEARN    0.17   0.30     1.00
round(cor(data.all.09fem[num.vars]), 2)
##          UTOTHRS TENURE HRLYEARN
## UTOTHRS     1.00   0.18     0.20
## TENURE      0.18   1.00     0.35
## HRLYEARN    0.20   0.35     1.00
round(cor(data.all.19fem[num.vars]), 2)
##          UTOTHRS TENURE HRLYEARN
## UTOTHRS     1.00   0.14     0.19
## TENURE      0.14   1.00     0.34
## HRLYEARN    0.19   0.34     1.00
# Spearman (Numeric and Ordinal)
# Rank correlation needs numeric input, so each ordered factor is replaced by
# its integer level code. ord.num then holds, in this order: SURVYEAR, SEX,
# the three numeric variables, and the five level-coded ordinal variables.
ord.vars <- c("AGE_12", "EDUC", "ESTSIZE", "FIRMSIZE", "AGYOWNK")
ord <- lapply(data.all[ord.vars], as.numeric)
ord.num <- cbind(data.all[c("SURVYEAR", "SEX", num.vars)], ord)
round(cor(ord.num[ord.num$SURVYEAR == 2009 & ord.num$SEX == "Male", 3:10], method="spearman"), 2)
##          UTOTHRS TENURE HRLYEARN AGE_12  EDUC ESTSIZE FIRMSIZE AGYOWNK
## UTOTHRS     1.00   0.10     0.14   0.13 -0.01   -0.04    -0.10   -0.13
## TENURE      0.10   1.00     0.37   0.47  0.09    0.20     0.19   -0.14
## HRLYEARN    0.14   0.37     1.00   0.31  0.37    0.27     0.24   -0.24
## AGE_12      0.13   0.47     0.31   1.00  0.10    0.09     0.06   -0.04
## EDUC       -0.01   0.09     0.37   0.10  1.00    0.16     0.15   -0.16
## ESTSIZE    -0.04   0.20     0.27   0.09  0.16    1.00     0.60   -0.07
## FIRMSIZE   -0.10   0.19     0.24   0.06  0.15    0.60     1.00   -0.05
## AGYOWNK    -0.13  -0.14    -0.24  -0.04 -0.16   -0.07    -0.05    1.00
round(cor(ord.num[ord.num$SURVYEAR == 2019 & ord.num$SEX == "Male", 3:10], method="spearman"), 2)
##          UTOTHRS TENURE HRLYEARN AGE_12  EDUC ESTSIZE FIRMSIZE AGYOWNK
## UTOTHRS     1.00   0.11     0.18   0.11 -0.02    0.02    -0.06   -0.12
## TENURE      0.11   1.00     0.36   0.47  0.10    0.17     0.18   -0.12
## HRLYEARN    0.18   0.36     1.00   0.26  0.39    0.28     0.25   -0.26
## AGE_12      0.11   0.47     0.26   1.00  0.08    0.07     0.04   -0.02
## EDUC       -0.02   0.10     0.39   0.08  1.00    0.18     0.18   -0.17
## ESTSIZE     0.02   0.17     0.28   0.07  0.18    1.00     0.61   -0.09
## FIRMSIZE   -0.06   0.18     0.25   0.04  0.18    0.61     1.00   -0.07
## AGYOWNK    -0.12  -0.12    -0.26  -0.02 -0.17   -0.09    -0.07    1.00
round(cor(ord.num[ord.num$SURVYEAR == 2009 & ord.num$SEX == "Female", 3:10], method="spearman"), 2)
##          UTOTHRS TENURE HRLYEARN AGE_12  EDUC ESTSIZE FIRMSIZE AGYOWNK
## UTOTHRS     1.00   0.16     0.22   0.12  0.14    0.10     0.04   -0.05
## TENURE      0.16   1.00     0.42   0.51  0.11    0.20     0.16   -0.08
## HRLYEARN    0.22   0.42     1.00   0.27  0.48    0.34     0.26   -0.17
## AGE_12      0.12   0.51     0.27   1.00  0.05    0.09     0.04    0.05
## EDUC        0.14   0.11     0.48   0.05  1.00    0.16     0.13   -0.15
## ESTSIZE     0.10   0.20     0.34   0.09  0.16    1.00     0.55   -0.05
## FIRMSIZE    0.04   0.16     0.26   0.04  0.13    0.55     1.00   -0.04
## AGYOWNK    -0.05  -0.08    -0.17   0.05 -0.15   -0.05    -0.04    1.00
round(cor(ord.num[ord.num$SURVYEAR == 2019 & ord.num$SEX == "Female", 3:10], method="spearman"), 2)
##          UTOTHRS TENURE HRLYEARN AGE_12  EDUC ESTSIZE FIRMSIZE AGYOWNK
## UTOTHRS     1.00   0.13     0.21   0.09  0.14    0.11     0.04   -0.05
## TENURE      0.13   1.00     0.40   0.50  0.10    0.19     0.18   -0.07
## HRLYEARN    0.21   0.40     1.00   0.20  0.48    0.33     0.27   -0.20
## AGE_12      0.09   0.50     0.20   1.00  0.01    0.05     0.01    0.09
## EDUC        0.14   0.10     0.48   0.01  1.00    0.18     0.14   -0.19
## ESTSIZE     0.11   0.19     0.33   0.05  0.18    1.00     0.58   -0.06
## FIRMSIZE    0.04   0.18     0.27   0.01  0.14    0.58     1.00   -0.05
## AGYOWNK    -0.05  -0.07    -0.20   0.09 -0.19   -0.06    -0.05    1.00

Hourly Wages Relationship to Other Variables

  • Hourly Wages vs Numeric Variables: Scatterplot Analysis
summary(data.all[num.vars])
##     UTOTHRS          TENURE          HRLYEARN     
##  Min.   : 0.40   Min.   :  1.00   Min.   :  2.00  
##  1st Qu.:35.00   1st Qu.: 15.00   1st Qu.: 15.00  
##  Median :40.00   Median : 53.00   Median : 21.00  
##  Mean   :36.23   Mean   : 86.02   Mean   : 24.37  
##  3rd Qu.:40.00   3rd Qu.:143.00   3rd Qu.: 30.77  
##  Max.   :99.00   Max.   :240.00   Max.   :115.38
# HW v. USUAL HOURS WORKED PER WEEK ############################################
# Scatterplot of hourly wage against usual weekly hours for each year/sex
# subset, with a LOWESS smoother drawn on top as a thick red line. All four
# panels share the same 0-120 limits on both axes.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(4.5, 4.5, 1.0, 1.0))

# Males 2009
with(data.all.09male, {
  plot(UTOTHRS, HRLYEARN,
       xlim = c(0, 120), ylim = c(0, 120),
       cex.lab = 1.8, cex.axis = 1.8,
       xlab = "Usual Hours Worked per Week (UTOTHRS)",
       ylab = "Hourly Wage (HRLYEARN)")
  lines(lowess(UTOTHRS, HRLYEARN), col = "red", lwd = 8)
})

# Males 2019
with(data.all.19male, {
  plot(UTOTHRS, HRLYEARN,
       xlim = c(0, 120), ylim = c(0, 120),
       cex.lab = 1.8, cex.axis = 1.8,
       xlab = "Usual Hours Worked per Week (UTOTHRS)",
       ylab = "Hourly Wage (HRLYEARN)")
  lines(lowess(UTOTHRS, HRLYEARN), col = "red", lwd = 8)
})

# Females 2009
with(data.all.09fem, {
  plot(UTOTHRS, HRLYEARN,
       xlim = c(0, 120), ylim = c(0, 120),
       cex.lab = 1.8, cex.axis = 1.8,
       xlab = "Usual Hours Worked per Week (UTOTHRS)",
       ylab = "Hourly Wage (HRLYEARN)")
  lines(lowess(UTOTHRS, HRLYEARN), col = "red", lwd = 8)
})

# Females 2019
with(data.all.19fem, {
  plot(UTOTHRS, HRLYEARN,
       xlim = c(0, 120), ylim = c(0, 120),
       cex.lab = 1.8, cex.axis = 1.8,
       xlab = "Usual Hours Worked per Week (UTOTHRS)",
       ylab = "Hourly Wage (HRLYEARN)")
  lines(lowess(UTOTHRS, HRLYEARN), col = "red", lwd = 8)
})

# HW v. TENURE #################################################################
# Scatterplot of hourly wage against job tenure (months) for each year/sex
# subset, with a LOWESS smoother drawn on top as a thick red line. Both axes
# run 0-240 in every panel.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(4.5, 4.5, 1.0, 1.0))

# Males 2009
with(data.all.09male, {
  plot(TENURE, HRLYEARN,
       xlim = c(0, 240), ylim = c(0, 240),
       cex.lab = 1.8, cex.axis = 1.8,
       xlab = "Tenure in Months",
       ylab = "Hourly Wage (HRLYEARN)")
  lines(lowess(TENURE, HRLYEARN), col = "red", lwd = 8)
})

# Males 2019
with(data.all.19male, {
  plot(TENURE, HRLYEARN,
       xlim = c(0, 240), ylim = c(0, 240),
       cex.lab = 1.8, cex.axis = 1.8,
       xlab = "Tenure in Months",
       ylab = "Hourly Wage (HRLYEARN)")
  lines(lowess(TENURE, HRLYEARN), col = "red", lwd = 8)
})

# Females 2009
with(data.all.09fem, {
  plot(TENURE, HRLYEARN,
       xlim = c(0, 240), ylim = c(0, 240),
       cex.lab = 1.8, cex.axis = 1.8,
       xlab = "Tenure in Months",
       ylab = "Hourly Wage (HRLYEARN)")
  lines(lowess(TENURE, HRLYEARN), col = "red", lwd = 8)
})

# Females 2019
with(data.all.19fem, {
  plot(TENURE, HRLYEARN,
       xlim = c(0, 240), ylim = c(0, 240),
       cex.lab = 1.8, cex.axis = 1.8,
       xlab = "Tenure in Months",
       ylab = "Hourly Wage (HRLYEARN)")
  lines(lowess(TENURE, HRLYEARN), col = "red", lwd = 8)
})

  • Hourly Wages vs. Ordinal Variables: Boxplot Analysis
summary(data.all[ord.vars])
##      AGE_12                                        EDUC          ESTSIZE     
##  45-49  :12952   0 to 8 years                        : 2003   <20    :36269  
##  50-54  :12380   Some high school                    : 9783   20-99  :35799  
##  40-44  :11782   High school graduate                :21594   100-500:20604  
##  35-39  :11475   Some postsecondary                  : 7400   >500   :12816  
##  30-34  :11045   Postsecondary certificate or diploma:39814                  
##  25-29  :10937   Bachelor's degree                   :17378                  
##  (Other):34917   Above bachelor's degree             : 7516                  
##     FIRMSIZE       AGYOWNK     
##  <20    :20662   >6    :13510  
##  20-99  :17463   6-12  :11724  
##  100-500:15467   13-17 : 8270  
##  >500   :51896   18-24 : 6840  
##                  >24|NC:65144  
##                                
## 
# HW v. AGE GROUPS #############################################################
# Hourly wage by age group, one panel per year/sex subset. The bottom margin
# is enlarged for the rotated (las = 2) group labels, and the x-axis title is
# placed by hand with mtext() so it sits below them.

# Males 2009
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(8, 4.5, 1.0, 1.0))
plot(data.all.09male$AGE_12, data.all.09male$HRLYEARN,
     ylim = c(0, 115), las = 2,
     cex.lab = 1.8, cex.axis = 1.8,
     col = "slategray2",
     xlab = "", ylab = "Hourly Wage")
mtext("Age Groups", side = 1, line = 5.8, cex = 1.8)

# Males 2019
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(8, 4.5, 1.0, 1.0))
plot(data.all.19male$AGE_12, data.all.19male$HRLYEARN,
     ylim = c(0, 115), las = 2,
     cex.lab = 1.8, cex.axis = 1.8,
     col = "royalblue",
     xlab = "", ylab = "Hourly Wage")
mtext("Age Groups", side = 1, line = 5.8, cex = 1.8)

# Females 2009
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(8, 4.5, 1.0, 1.0))
plot(data.all.09fem$AGE_12, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), las = 2,
     cex.lab = 1.8, cex.axis = 1.8,
     col = "moccasin",
     xlab = "", ylab = "Hourly Wage")
mtext("Age Groups", side = 1, line = 5.8, cex = 1.8)

# Females 2019
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(8, 4.5, 1.0, 1.0))
plot(data.all.19fem$AGE_12, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), las = 2,
     cex.lab = 1.8, cex.axis = 1.8,
     col = "darkorange",
     xlab = "", ylab = "Hourly Wage")
mtext("Age Groups", side = 1, line = 5.8, cex = 1.8)

# HW v. EDUCATION ##############################################################
# Males 2009
summary(data.all$EDUCshort)
##     No.HS   Some.HS   HS.grad Some.Post Post.cert  Bachelor   Above.B 
##      2003      9783     21594      7400     39814     17378      7516
# Hourly wage by highest education attainment (short labels), one panel per
# year/sex subset. Both axis titles are suppressed in plot() and added with
# mtext() so they clear the rotated tick labels in the enlarged margins.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(10.5, 5.5, 1.0, 1.0))
plot(data.all.09male$EDUCshort, data.all.09male$HRLYEARN,
     ylim = c(0, 115), las = 2,
     cex.lab = 1.8, cex.axis = 1.8,
     col = "slategray2",
     xlab = "", ylab = "")
mtext("Highest Education Attainment", side = 1, line = 9, cex = 1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Males 2019
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(10.5, 5.5, 1.0, 1.0))
plot(data.all.19male$EDUCshort, data.all.19male$HRLYEARN,
     ylim = c(0, 115), las = 2,
     cex.lab = 1.8, cex.axis = 1.8,
     col = "royalblue",
     xlab = "", ylab = "")
mtext("Highest Education Attainment", side = 1, line = 9, cex = 1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Females 2009
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(10.5, 5.5, 1.0, 1.0))
plot(data.all.09fem$EDUCshort, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), las = 2,
     cex.lab = 1.8, cex.axis = 1.8,
     col = "moccasin",
     xlab = "", ylab = "")
mtext("Highest Education Attainment", side = 1, line = 9, cex = 1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Females 2019
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(10.5, 5.5, 1.0, 1.0))
plot(data.all.19fem$EDUCshort, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), las = 2,
     cex.lab = 1.8, cex.axis = 1.8,
     col = "darkorange",
     xlab = "", ylab = "")
mtext("Highest Education Attainment", side = 1, line = 9, cex = 1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# HW v. ESTABLISHMENT SIZE #####################################################
# Males 2009
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
with(
  data.all.09male,
  plot(
    ESTSIZE, HRLYEARN,
    ylim = c(0, 115),
    cex.lab = 1.8, cex.axis = 1.8,
    las = 2,              # tick labels perpendicular to the axes
    col = "slategray2",
    xlab = "", ylab = ""
  )
)
# X-axis titles are currently disabled:
# mtext("Establishment Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Males 2019
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
with(
  data.all.19male,
  plot(
    ESTSIZE, HRLYEARN,
    ylim = c(0, 115),
    cex.lab = 1.8, cex.axis = 1.8,
    las = 2,              # tick labels perpendicular to the axes
    col = "royalblue",
    xlab = "", ylab = ""
  )
)

# mtext("Establishment Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
with(
  data.all.09fem,
  plot(
    ESTSIZE, HRLYEARN,
    ylim = c(0, 115),
    cex.lab = 1.8, cex.axis = 1.8,
    las = 2,              # tick labels perpendicular to the axes
    col = "moccasin",
    xlab = "", ylab = ""
  )
)

# mtext("Establishment Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar=c(7.5, 5.5, 1.0, 18.0))
# Plot the 2019 female subset. This panel previously read data.all.09fem,
# which merely repeated the "Females 2009" panel in a different colour.
plot(data.all.19fem$ESTSIZE
     , data.all.19fem$HRLYEARN
     , ylim = c(0,115)
     , cex.lab = 1.8
     , cex.axis = 1.8
     , las = 2
     , col = "darkorange"
     , xlab=""
     , ylab=""
)

# mtext("Establishment Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)

# HW v. FIRM SIZE #####################################################
# Males 2009
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
with(
  data.all.09male,
  plot(
    FIRMSIZE, HRLYEARN,
    ylim = c(0, 115),
    cex.lab = 1.8, cex.axis = 1.8,
    las = 2,              # tick labels perpendicular to the axes
    col = "slategray2",
    xlab = "", ylab = ""
  )
)
# X-axis titles are currently disabled:
# mtext("Firm Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Males 2019
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
with(
  data.all.19male,
  plot(
    FIRMSIZE, HRLYEARN,
    ylim = c(0, 115),
    cex.lab = 1.8, cex.axis = 1.8,
    las = 2,              # tick labels perpendicular to the axes
    col = "royalblue",
    xlab = "", ylab = ""
  )
)

# mtext("Firm Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
with(
  data.all.09fem,
  plot(
    FIRMSIZE, HRLYEARN,
    ylim = c(0, 115),
    cex.lab = 1.8, cex.axis = 1.8,
    las = 2,              # tick labels perpendicular to the axes
    col = "moccasin",
    xlab = "", ylab = ""
  )
)

# mtext("Firm Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar=c(7.5, 5.5, 1.0, 18.0))
# Plot the 2019 female subset. This panel previously read data.all.09fem,
# which merely repeated the "Females 2009" panel in a different colour.
plot(data.all.19fem$FIRMSIZE
     , data.all.19fem$HRLYEARN
     , ylim = c(0,115)
     , cex.lab = 1.8
     , cex.axis = 1.8
     , las = 2
     , col = "darkorange"
     , xlab=""
     , ylab=""
)

# mtext("Firm Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)

# HW v. AGE OF YOUNGEST CHILD ##################################################
summary(data.all[ord.vars])
##      AGE_12                                        EDUC          ESTSIZE     
##  45-49  :12952   0 to 8 years                        : 2003   <20    :36269  
##  50-54  :12380   Some high school                    : 9783   20-99  :35799  
##  40-44  :11782   High school graduate                :21594   100-500:20604  
##  35-39  :11475   Some postsecondary                  : 7400   >500   :12816  
##  30-34  :11045   Postsecondary certificate or diploma:39814                  
##  25-29  :10937   Bachelor's degree                   :17378                  
##  (Other):34917   Above bachelor's degree             : 7516                  
##     FIRMSIZE       AGYOWNK     
##  <20    :20662   >6    :13510  
##  20-99  :17463   6-12  :11724  
##  100-500:15467   13-17 : 8270  
##  >500   :51896   18-24 : 6840  
##                  >24|NC:65144  
##                                
## 
# Males 2009
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
with(
  data.all.09male,
  plot(
    AGYOWNK, HRLYEARN,
    ylim = c(0, 115),
    cex.lab = 1.8, cex.axis = 1.8,
    las = 2,              # tick labels perpendicular to the axes
    col = "slategray2",
    xlab = "", ylab = ""
  )
)
# X-axis title is currently disabled:
# mtext("Age of Youngest Child", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Males 2019
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
with(
  data.all.19male,
  plot(
    AGYOWNK, HRLYEARN,
    ylim = c(0, 115),
    cex.lab = 1.8, cex.axis = 1.8,
    las = 2,              # tick labels perpendicular to the axes
    col = "royalblue",
    xlab = "", ylab = ""
  )
)

# mtext("Age of Youngest Child", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
with(
  data.all.09fem,
  plot(
    AGYOWNK, HRLYEARN,
    ylim = c(0, 115),
    cex.lab = 1.8, cex.axis = 1.8,
    las = 2,              # tick labels perpendicular to the axes
    col = "moccasin",
    xlab = "", ylab = ""
  )
)

# mtext("Age of Youngest Child", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar=c(7.5, 5.5, 1.0, 18.0))
# Plot the 2019 female subset. This panel previously read data.all.09fem,
# which merely repeated the "Females 2009" panel in a different colour.
plot(data.all.19fem$AGYOWNK
     , data.all.19fem$HRLYEARN
     , ylim = c(0,115)
     , cex.lab = 1.8
     , cex.axis = 1.8
     , las = 2
     , col = "darkorange"
     , xlab=""
     , ylab=""
)

# mtext("Age of Youngest Child", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
  • Hourly Wages vs. Nominal Variables: Boxplot Analysis
# HW v. PROVINCE ###############################################################
summary(data.all$PROV)
##    NL   PEI    NS    NB    QC    ON    MB    SK    AB    BC 
##  3515  2871  5197  5311 18664 29816  9332  7513 11177 12092
# Males 2009
# Margins changed from R's default c(5.1, 4.1, 4.1, 2.1).
par(mar = c(4.5, 5.5, 1.0, 1.0))
# Sort the province levels by median hourly wage for the x-axis.
prov.order <- reorder(data.all.09male$PROV, data.all.09male$HRLYEARN, FUN = median)
plot(
  prov.order, data.all.09male$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "slategray2",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Province", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Median hourly wage per province (original level order), 2 decimals.
round(with(data.all.09male, tapply(HRLYEARN, PROV, median)), digits = 2)
##    NL   PEI    NS    NB    QC    ON    MB    SK    AB    BC 
## 18.95 15.80 18.48 17.50 19.98 21.54 18.75 21.00 24.23 23.44
# Males 2019
# Margins changed from R's default c(5.1, 4.1, 4.1, 2.1).
par(mar = c(4.5, 5.5, 1.0, 1.0))
# Sort the province levels by median hourly wage for the x-axis.
prov.order <- reorder(data.all.19male$PROV, data.all.19male$HRLYEARN, FUN = median)
plot(
  prov.order, data.all.19male$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "royalblue",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Province", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Median hourly wage per province (original level order), 2 decimals.
round(with(data.all.19male, tapply(HRLYEARN, PROV, median)), digits = 2)
##    NL   PEI    NS    NB    QC    ON    MB    SK    AB    BC 
## 25.00 20.00 22.12 21.00 24.90 26.00 24.04 27.00 31.77 28.50
# Females 2009
# Margins changed from R's default c(5.1, 4.1, 4.1, 2.1).
par(mar = c(4.5, 5.5, 1.0, 1.0))
# Sort the province levels by median hourly wage for the x-axis.
prov.order <- reorder(data.all.09fem$PROV, data.all.09fem$HRLYEARN, FUN = median)
plot(
  prov.order, data.all.09fem$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "moccasin",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Province", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Median hourly wage per province (original level order), 2 decimals.
round(with(data.all.09fem, tapply(HRLYEARN, PROV, median)), digits = 2)
##    NL   PEI    NS    NB    QC    ON    MB    SK    AB    BC 
## 15.00 14.42 15.21 15.00 17.00 18.00 16.31 17.62 18.75 18.20
# Females 2019
# Margins changed from R's default c(5.1, 4.1, 4.1, 2.1).
par(mar = c(4.5, 5.5, 1.0, 1.0))
# Sort the province levels by median hourly wage for the x-axis.
prov.order <- reorder(data.all.19fem$PROV, data.all.19fem$HRLYEARN, FUN = median)
plot(
  prov.order, data.all.19fem$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "darkorange",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Province", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Median hourly wage per province (original level order), 2 decimals.
round(with(data.all.19fem, tapply(HRLYEARN, PROV, median)), digits = 2)
##    NL   PEI    NS    NB    QC    ON    MB    SK    AB    BC 
## 20.34 20.00 19.00 19.98 22.00 23.08 20.60 23.00 24.00 23.00
# HW v. MARITAL STATUS #########################################################
summary(data.all$MARSTAT)
##    Married Common-law    Widowed  Separated   Divorced Single, NM 
##      50075      16182       1175       2986       4835      30235
# Males 2009
# Bottom margin enlarged for the rotated category labels
# (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(10, 5.5, 1.0, 1.0))
# Sort the marital-status levels by median hourly wage for the x-axis.
ms.order <- reorder(data.all.09male$MARSTAT, data.all.09male$HRLYEARN, FUN = median)
plot(
  ms.order, data.all.09male$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "slategray2",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Marital Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Median hourly wage per marital status (original level order), 2 decimals.
round(with(data.all.09male, tapply(HRLYEARN, MARSTAT, median)), digits = 2)
##    Married Common-law    Widowed  Separated   Divorced Single, NM 
##      24.00      21.63      20.00      22.00      22.00      15.00
# Males 2019
# Bottom margin enlarged for the rotated category labels
# (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(10, 5.5, 1.0, 1.0))
# Sort the marital-status levels by median hourly wage for the x-axis.
ms.order <- reorder(data.all.19male$MARSTAT, data.all.19male$HRLYEARN, FUN = median)
plot(
  ms.order, data.all.19male$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "royalblue",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Marital Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Median hourly wage per marital status (original level order), 2 decimals.
round(with(data.all.19male, tapply(HRLYEARN, MARSTAT, median)), digits = 2)
##    Married Common-law    Widowed  Separated   Divorced Single, NM 
##      30.00      27.20      23.17      28.00      27.50      19.69
# Females 2009
# Bottom margin enlarged for the rotated category labels
# (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(10, 5.5, 1.0, 1.0))
# Sort the marital-status levels by median hourly wage for the x-axis.
ms.order <- reorder(data.all.09fem$MARSTAT, data.all.09fem$HRLYEARN, FUN = median)
plot(
  ms.order, data.all.09fem$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "moccasin",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Marital Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Median hourly wage per marital status (original level order), 2 decimals.
round(with(data.all.09fem, tapply(HRLYEARN, MARSTAT, median)), digits = 2)
##    Married Common-law    Widowed  Separated   Divorced Single, NM 
##      19.23      17.00      16.45      18.00      19.23      12.51
# Females 2019
# Bottom margin enlarged for the rotated category labels
# (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(10, 5.5, 1.0, 1.0))
# Sort the marital-status levels by median hourly wage for the x-axis.
ms.order <- reorder(data.all.19fem$MARSTAT, data.all.19fem$HRLYEARN, FUN = median)
plot(
  ms.order, data.all.19fem$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "darkorange",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Marital Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Median hourly wage per marital status (original level order), 2 decimals.
round(with(data.all.19fem, tapply(HRLYEARN, MARSTAT, median)), digits = 2)
##    Married Common-law    Widowed  Separated   Divorced Single, NM 
##      24.92      23.00      20.00      24.00      22.67      17.95
# HW v. FULL TIME OR PART TIME #################################################
summary(data.all$FTPTMAIN)
## Full-time Part-time 
##     86157     19331
# Males 2009
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
# Sort the full-time/part-time levels by median hourly wage for the x-axis.
ftpt.order <- reorder(data.all.09male$FTPTMAIN, data.all.09male$HRLYEARN, FUN = median)
plot(
  ftpt.order, data.all.09male$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "slategray2",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Full Time or Part Time Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Males 2019
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
# Sort the full-time/part-time levels by median hourly wage for the x-axis.
ftpt.order <- reorder(data.all.19male$FTPTMAIN, data.all.19male$HRLYEARN, FUN = median)
plot(
  ftpt.order, data.all.19male$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "royalblue",
  xlab = "", ylab = ""
)

# mtext("Full Time or Part Time Status", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
# Sort the full-time/part-time levels by median hourly wage for the x-axis.
ftpt.order <- reorder(data.all.09fem$FTPTMAIN, data.all.09fem$HRLYEARN, FUN = median)
plot(
  ftpt.order, data.all.09fem$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "moccasin",
  xlab = "", ylab = ""
)

# mtext("Full Time or Part Time Status", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
# Sort the full-time/part-time levels by median hourly wage for the x-axis.
ftpt.order <- reorder(data.all.19fem$FTPTMAIN, data.all.19fem$HRLYEARN, FUN = median)
plot(
  ftpt.order, data.all.19fem$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "darkorange",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Full Time or Part Time Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Medians ALL GROUPS
# Median hourly wage (HRLYEARN) within each FTPTMAIN level, rounded to two
# decimals, computed in turn for the four subsets: males 2009, males 2019,
# females 2009, females 2019. Lines starting with "##" are the echoed output.
round(tapply(data.all.09male$HRLYEARN, INDEX=data.all.09male$FTPTMAIN, FUN=median), 2)
## Full-time Part-time 
##      22.0      10.5
round(tapply(data.all.19male$HRLYEARN, INDEX=data.all.19male$FTPTMAIN, FUN=median), 2)
## Full-time Part-time 
##     27.46     15.00
round(tapply(data.all.09fem$HRLYEARN, INDEX=data.all.09fem$FTPTMAIN, FUN=median), 2)
## Full-time Part-time 
##        19        12
round(tapply(data.all.19fem$HRLYEARN, INDEX=data.all.19fem$FTPTMAIN, FUN=median), 2)
## Full-time Part-time 
##     24.06     16.00
# HW v. JOB PERMANENCY #################################################
# Level counts for the variable analysed in this section. The call previously
# summarised FTPTMAIN (copied from the full-time/part-time section), although
# every plot and median below is grouped by PERMTEMP.
# NOTE(review): assumes data.all carries PERMTEMP like the four subsets do.
summary(data.all$PERMTEMP)
## (Output needs to be regenerated. The counts echoed here before,
##  Full-time 86157 / Part-time 19331, were those of FTPTMAIN.)
# Males 2009
# Bottom margin 12 lines, right margin 18 lines
# (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(12, 5.5, 1.0, 18.0))
# Sort the job-permanency levels by median hourly wage for the x-axis.
per.order <- reorder(data.all.09male$PERMTEMP, data.all.09male$HRLYEARN, FUN = median)
plot(
  per.order, data.all.09male$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "slategray2",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Job Permanency", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Males 2019
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
# Sort the job-permanency levels by median hourly wage for the x-axis.
per.order <- reorder(data.all.19male$PERMTEMP, data.all.19male$HRLYEARN, FUN = median)
plot(
  per.order, data.all.19male$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "royalblue",
  xlab = "", ylab = ""
)

# mtext("Job Permanency", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
# Sort the job-permanency levels by median hourly wage for the x-axis.
per.order <- reorder(data.all.09fem$PERMTEMP, data.all.09fem$HRLYEARN, FUN = median)
plot(
  per.order, data.all.09fem$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "moccasin",
  xlab = "", ylab = ""
)

# mtext("Job Permanency", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
# Right margin widened to 18 lines (R's default mar is c(5.1, 4.1, 4.1, 2.1)).
par(mar = c(7.5, 5.5, 1.0, 18.0))
# Sort the job-permanency levels by median hourly wage for the x-axis.
per.order <- reorder(data.all.19fem$PERMTEMP, data.all.19fem$HRLYEARN, FUN = median)
plot(
  per.order, data.all.19fem$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8, cex.axis = 1.8,
  las = 2,              # tick labels perpendicular to the axes
  col = "darkorange",
  xlab = "", ylab = ""
)

# mtext("Job Permanency", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Medians ALL GROUPS
# Median hourly wage (HRLYEARN) within each PERMTEMP level, rounded to two
# decimals, computed in turn for the four subsets: males 2009, males 2019,
# females 2009, females 2019. Lines starting with "##" are the echoed output.
round(tapply(data.all.09male$HRLYEARN, INDEX=data.all.09male$PERMTEMP, FUN=median), 2)
##      Permanent   Temp. season Temp. contract   Temp. casual 
##          21.63          15.00          19.23          12.00
round(tapply(data.all.19male$HRLYEARN, INDEX=data.all.19male$PERMTEMP, FUN=median), 2)
##      Permanent   Temp. season Temp. contract   Temp. casual 
##          26.92          19.50          24.93          15.54
round(tapply(data.all.09fem$HRLYEARN, INDEX=data.all.09fem$PERMTEMP, FUN=median), 2)
##      Permanent   Temp. season Temp. contract   Temp. casual 
##          17.85          11.00          17.21          12.50
round(tapply(data.all.19fem$HRLYEARN, INDEX=data.all.19fem$PERMTEMP, FUN=median), 2)
##      Permanent   Temp. season Temp. contract   Temp. casual 
##          23.00          15.00          22.44          16.25
# HW v. ECONOMIC FAMILY ########################################################
summary(data.all$EFAMTYPE)
##    Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC 
##  16842  22136  28911   7892   4053   4009    986   3423   1355    712    749 
## HWNE17 HWNE24  SPE17  SPE24  SPN17  SPN24  Other 
##    109    181   4991   2024    267    189   6659
# Males 2009
# Margins changed from R's default c(5.1, 4.1, 4.1, 2.1).
par(mar = c(6, 5.5, 1.0, 1.0))
# Sort the economic-family types by median hourly wage for the x-axis.
efa.order <- reorder(data.all.09male$EFAMTYPE, data.all.09male$HRLYEARN, FUN = median)
plot(
  efa.order, data.all.09male$HRLYEARN,
  ylim = c(0, 115),
  cex.lab = 1.8,
  cex.axis = 1.3,       # smaller tick labels than the other sections (1.8)
  las = 2,              # tick labels perpendicular to the axes
  col = "slategray2",
  xlab = "", ylab = ""
)
# X-axis title is currently disabled:
# mtext("Type of Economic Family", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)

# Males 2019
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Set the margins explicitly (same values as the Males 2009 panel) so this
# panel no longer depends on par() state left behind by an earlier chunk.
par(mar=c(6, 5.5, 1.0, 1.0))
efa.order <- with(data.all.19male, reorder(EFAMTYPE, HRLYEARN, median))
# cex.axis matches the Males 2009 panel (1.3) so the four economic-family
# panels share one tick-label size for the same set of family-type labels.
plot(efa.order
     , data.all.19male$HRLYEARN
     , ylim = c(0,115)
     , cex.lab = 1.8
     , cex.axis = 1.3
     , las = 2
     , col = "royalblue"
     , xlab=""
     , ylab=""
)

# mtext("Type of Economic Family", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Set the margins explicitly (same values as the Males 2009 panel) so this
# panel no longer depends on par() state left behind by an earlier chunk.
par(mar=c(6, 5.5, 1.0, 1.0))
efa.order <- with(data.all.09fem, reorder(EFAMTYPE, HRLYEARN, median))
# cex.axis matches the Males 2009 panel (1.3) so the four economic-family
# panels share one tick-label size for the same set of family-type labels.
plot(efa.order
     , data.all.09fem$HRLYEARN
     , ylim = c(0,115)
     , cex.lab = 1.8
     , cex.axis = 1.3
     , las = 2
     , col = "moccasin"
     , xlab=""
     , ylab=""
)

# mtext("Type of Economic Family", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Set the margins explicitly (same values as the Males 2009 panel) so this
# panel no longer depends on par() state left behind by an earlier chunk.
par(mar=c(6, 5.5, 1.0, 1.0))
efa.order <- with(data.all.19fem, reorder(EFAMTYPE, HRLYEARN, median))
# cex.axis matches the Males 2009 panel (1.3) so the four economic-family
# panels share one tick-label size for the same set of family-type labels.
plot(efa.order
     , data.all.19fem$HRLYEARN
     , ylim = c(0,115)
     , cex.lab = 1.8
     , cex.axis = 1.3
     , las = 2
     , col = "darkorange"
     , xlab=""
     , ylab=""
)

# mtext("Type of Economic Family", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Medians ALL GROUPS
# Median hourly wage (HRLYEARN) within each EFAMTYPE level, rounded to two
# decimals, computed in turn for the four subsets: males 2009, males 2019,
# females 2009, females 2019. Lines starting with "##" are the echoed output.
round(tapply(data.all.09male$HRLYEARN, INDEX=data.all.09male$EFAMTYPE, FUN=median), 2)
##    Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC 
##  20.00  22.00  23.00  18.86  20.48  21.25  19.56  18.00  10.00  12.00  16.86 
## HWNE17 HWNE24  SPE17  SPE24  SPN17  SPN24  Other 
##  10.00  12.00  17.52  14.96  10.00  12.00  18.00
round(tapply(data.all.19male$HRLYEARN, INDEX=data.all.19male$EFAMTYPE, FUN=median), 2)
##    Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC 
##  24.75  27.00  29.74  24.00  25.26  28.85  25.00  21.50  13.85  17.00  21.37 
## HWNE17 HWNE24  SPE17  SPE24  SPN17  SPN24  Other 
##  14.50  19.00  24.00  18.50  15.00  15.07  21.00
round(tapply(data.all.09fem$HRLYEARN, INDEX=data.all.09fem$EFAMTYPE, FUN=median), 2)
##    Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC 
##  17.80  18.40  18.00  16.00  14.75   9.60  10.50  17.00  16.00  15.71  16.00 
## HWNE17 HWNE24  SPE17  SPE24  SPN17  SPN24  Other 
##   9.50  10.00  16.05  15.68   9.38  10.00  16.00
round(tapply(data.all.19fem$HRLYEARN, INDEX=data.all.19fem$EFAMTYPE, FUN=median), 2)
##    Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC 
##  22.20  23.00  24.50  21.00  20.00  14.13  15.00  21.00  20.00  21.20  21.24 
## HWNE17 HWNE24  SPE17  SPE24  SPN17  SPN24  Other 
##  13.82  15.00  20.00  19.00  14.36  15.50  19.50

STEP 3: Hourly Wages by Sector, Industry, and Occupation

Highest hourly wages by year and gender, across sectors, industries, and occupations

# RESULTS BY GROUP #############################################################
# INDUSTRY #####################################################################
# Males 2009
# Sort the industry levels by median hourly wage for the x-axis.
ind.order <- reorder(data.all.09male$NAICS_18short, data.all.09male$HRLYEARN, FUN = median)
par(mar = c(15, 5, 2, 1))
# Keep the value returned by boxplot() so its stats, n and outlier groups can
# be inspected below.
wage.ind.09male <- boxplot(
  HRLYEARN ~ ind.order,
  data = data.all.09male,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "slategray2",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males 2009 - Hourly Wage by Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)

# Label the stats matrix: one row per statistic, one column per industry.
dimnames(wage.ind.09male$stats) <- list(
  c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence"),
  levels(ind.order)
)
wage.ind.09male$stats # Boxplot Summary
##             AcFood  Agri Rtail  Mngt Other   Info ManuN Whole  Trans Health
## Lower Fence   5.00  2.14  3.79  3.33  3.13  3.380  5.45  4.25  3.210  3.300
## Q1            9.35 10.00 10.00 11.00 13.70 13.000 15.00 15.00 16.000 16.465
## Median       11.00 13.55 14.00 14.00 18.90 19.815 20.00 20.00 20.510 21.450
## Q3           15.00 17.31 20.14 20.00 25.00 28.850 28.00 27.47 25.985 31.000
## Upper Fence  23.08 28.00 35.20 33.00 41.83 51.920 47.16 46.15 40.870 52.200
##             Const ManuD Finan Fores  Educa ProSc PubAd Utils
## Lower Fence  5.10  3.48  3.75  4.44  3.610  5.00  3.13  5.13
## Q1          17.00 17.00 15.34 19.78 19.700 20.19 22.03 23.50
## Median      22.00 22.00 22.00 28.00 28.280 28.29 30.00 30.64
## Q3          29.87 29.51 33.65 34.62 37.555 38.74 37.91 38.00
## Upper Fence 49.15 48.08 60.00 55.77 64.100 65.93 61.54 58.24
wage.ind.09male$n # Sample size
##  [1] 1429  497 2994  992  929 1106 1776 1258 1860 1067 3209 2605  971 1330 1460
## [16] 1086 1880  493
table(wage.ind.09male$group) # Total Outliers
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##  80  14 136  64  33  23  60  49  96  29  41  76  30  49  19  38  39  10
kruskal.test(HRLYEARN ~ NAICS_18short, data = data.all.09male) # Diff. among groups
## 
##  Kruskal-Wallis rank sum test
## 
## data:  HRLYEARN by NAICS_18short
## Kruskal-Wallis chi-squared = 6107.8, df = 17, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NAICS_18short, data = data.all.09male, method = "bonferroni") # Post Hoc
# Males 2019
# Sort the industry levels by median hourly wage for the x-axis.
ind.order <- reorder(data.all.19male$NAICS_18short, data.all.19male$HRLYEARN, FUN = median)
par(mar = c(15, 5, 2, 1))
# Keep the value returned by boxplot() so its stats, n and outlier groups can
# be inspected below.
wage.ind.19male <- boxplot(
  HRLYEARN ~ ind.order,
  data = data.all.19male,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "royalblue",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males 2019 - Hourly Wage by Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)

# Label the stats matrix: one row per statistic, one column per industry.
dimnames(wage.ind.19male$stats) <- list(
  c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence"),
  levels(ind.order)
)
wage.ind.19male$stats # Boxplot Summary
##             AcFood  Rtail  Agri  Mngt  Info  Other ManuN Trans ManuD Whole
## Lower Fence   6.92  3.040  3.30  4.56  4.62  3.000  3.53  5.00  5.29  9.62
## Q1           13.15 14.000 15.00 15.00 16.00 18.000 18.50 19.35 20.00 20.00
## Median       15.00 16.750 18.47 18.50 23.05 23.080 25.00 25.00 26.00 26.00
## Q3           17.61 24.855 23.08 25.00 34.07 30.965 34.00 32.00 35.00 36.00
## Upper Fence  24.18 41.080 35.00 40.00 60.22 50.000 56.54 50.96 57.50 58.50
##             Health  Const Finan ProSc Educa Fores PubAd  Utils
## Lower Fence   4.81  8.170  3.53  5.77  3.25  3.21  5.13 14.000
## Q1           20.00 22.000 21.00 25.00 24.22 27.78 28.00 35.000
## Median       26.25 28.745 29.77 35.00 36.06 37.00 37.00 43.475
## Q3           37.90 37.000 42.31 46.67 48.08 49.04 47.00 52.000
## Upper Fence  63.37 58.000 72.92 78.85 82.05 80.77 75.00 76.920
wage.ind.19male$n # Sample size
##  [1] 1336 2792  457  955  981  819 1489 1872 2207 1108 1205 3264 1014 1326 1306
## [16] 1242 1746  380
table(wage.ind.19male$group) # Total Outliers
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 118 155  33  53  21  36  61  71  66  40  29  60  39  37  19  17  18   7
kruskal.test(HRLYEARN ~ NAICS_18short, data = data.all.19male) # Diff. among groups
## 
##  Kruskal-Wallis rank sum test
## 
## data:  HRLYEARN by NAICS_18short
## Kruskal-Wallis chi-squared = 6604, df = 17, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NAICS_18short, data = data.all.19male, method = "bonferroni") # Post Hoc
# Females 2009
# Sort the industry levels by median hourly wage for the x-axis.
ind.order <- reorder(data.all.09fem$NAICS_18short, data.all.09fem$HRLYEARN, FUN = median)
par(mar = c(15, 5, 2, 1))
# Keep the value returned by boxplot() so its stats, n and outlier groups can
# be inspected below.
wage.ind.09fem <- boxplot(
  HRLYEARN ~ ind.order,
  data = data.all.09fem,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "moccasin",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females 2009 - Hourly Wage by Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)

# Label the stats matrix: one row per statistic, one column per industry.
dimnames(wage.ind.09fem$stats) <- list(
  c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence"),
  levels(ind.order)
)
wage.ind.09fem$stats # Boxplot Summary
##             AcFood Rtail  Agri  Mngt ManuN Other  Info Whole Const  ManuD Trans
## Lower Fence   4.77  3.66  4.81  5.03  5.77  2.00  3.55  5.13  3.50  5.490  3.55
## Q1            9.00  9.50  9.70 10.30 11.50 10.53 11.00 13.00 13.85 14.420 13.50
## Median       10.00 10.95 11.00 13.00 15.00 15.00 16.00 16.83 16.92 18.000 18.00
## Q3           13.00 14.50 14.00 17.00 19.35 19.56 23.00 21.63 20.77 24.105 23.12
## Upper Fence  19.00 22.00 20.00 27.00 31.00 32.97 40.87 34.00 30.29 38.460 37.33
##             Finan ProSc Health Utils Educa PubAd Fores
## Lower Fence  5.22  3.08   3.50  3.90  3.07  5.49  5.00
## Q1          15.00 15.00  15.87 19.00 18.00 20.00 18.46
## Median      19.23 20.00  20.00 24.16 24.62 25.00 25.25
## Q3          25.00 28.21  29.07 29.78 33.65 31.87 33.65
## Upper Fence 40.00 47.99  48.73 44.00 57.05 49.60 55.27
wage.ind.09fem$n # Sample size
##  [1] 2518 4155  221  821 1007 1127 1079  511  381  640  677 1823 1074 6213  140
## [16] 3114 1878  236
table(wage.ind.09fem$group) # Total Outliers
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 178 301  14  54  53  68  30  15  25  31  17  98  37  62   5  47  59   9
kruskal.test(HRLYEARN ~ NAICS_18short, data = data.all.09fem) # Diff. among groups
## 
##  Kruskal-Wallis rank sum test
## 
## data:  HRLYEARN by NAICS_18short
## Kruskal-Wallis chi-squared = 9226.4, df = 17, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NAICS_18short, data = data.all.09fem, method = "bonferroni") # Post Hoc
# Females 2019
# Sort the industry levels by median hourly wage for the x-axis.
ind.order <- reorder(data.all.19fem$NAICS_18short, data.all.19fem$HRLYEARN, FUN = median)
par(mar = c(15, 5, 2, 1))
# Keep the value returned by boxplot() so its stats, n and outlier groups can
# be inspected below.
wage.ind.19fem <- boxplot(
  HRLYEARN ~ ind.order,
  data = data.all.19fem,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "darkorange",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females 2019 - Hourly Wage by Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)

# Label the stats matrix: one row per statistic, one column per industry.
dimnames(wage.ind.19fem$stats) <- list(
  c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence"),
  levels(ind.order)
)
wage.ind.19fem$stats # Boxplot Summary
##             AcFood Rtail   Agri  Mngt Other ManuN  Info  Trans Whole ManuD
## Lower Fence   7.41  8.25  5.770  5.26  4.81 10.50  5.00  3.070 10.30  6.92
## Q1           13.00 13.50 14.000 14.77 15.00 15.34 15.00 17.465 17.50 18.03
## Median       14.50 15.00 16.000 18.00 19.17 19.75 20.00 21.720 22.09 22.50
## Q3           16.86 19.00 20.875 23.67 27.40 26.00 28.35 27.295 28.85 29.81
## Upper Fence  22.50 27.25 30.000 36.54 45.05 41.03 48.21 42.000 45.67 46.77
##             Const Health  Finan ProSc Educa  PubAd  Fores Utils
## Lower Fence  6.07   3.50  4.360  3.13  3.48  4.810  4.730 16.07
## Q1          18.00  19.00 19.975 20.00 23.00 25.295 23.875 27.85
## Median      23.00  24.04 25.640 26.44 30.00 31.370 34.000 37.00
## Q3          28.90  35.00 34.055 35.49 42.31 41.325 45.095 46.15
## Upper Fence 45.00  59.00 54.950 58.00 71.15 65.000 72.120 72.82
wage.ind.19fem$n # Sample size
##  [1] 2204 3333  252  666  897  819  877  687  482  529  482 6227 1492 1168 3159
## [16] 1804  239  115
table(wage.ind.19fem$group) # Total Outliers
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 184 272  17  26  35  57  32  44  23  24  26  60  72  40  43  46   1   2
kruskal.test(HRLYEARN ~ NAICS_18short, data = data.all.19fem) # Diff. among groups
## 
##  Kruskal-Wallis rank sum test
## 
## data:  HRLYEARN by NAICS_18short
## Kruskal-Wallis chi-squared = 7815.6, df = 17, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NAICS_18short, data = data.all.19fem, method = "bonferroni") # Post Hoc

# RESULTS BY GROUP #############################################################
# OCCUPATION ###################################################################
# Males 2009
# Sort the occupation levels by median hourly wage for the x-axis.
occu.order.male09 <- reorder(data.all.09male$NOC_10short, data.all.09male$HRLYEARN, FUN = median)
par(mar = c(16, 5, 2, 1))
# Keep the value returned by boxplot() so its stats, n and outlier groups can
# be inspected below.
wage.occu.09male <- boxplot(
  HRLYEARN ~ occu.order.male09,
  data = data.all.09male,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "slategray2",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males 2009 - Hourly Wage by Occupation"
)
mtext("Occupation (NOC_10)", side = 1, line = 4.3)

# Label the stats matrix: one row per statistic, one column per occupation.
dimnames(wage.occu.09male$stats) <- list(
  c("LowFence", "Q1", "Median", "Q3", "UpFence"),
  levels(occu.order.male09)
)
wage.occu.09male$stats
##          Sales NatAgri ManUtil BusFin ArtCul Trades Health NatASc EduLaw  Mngt
## LowFence  3.13    2.14    6.67   3.13  4.210  3.210   7.00   3.79  3.610  3.48
## Q1       10.00   12.00   15.00  15.00 15.000 16.000  18.50  21.63 20.975 23.08
## Median   14.00   17.00   19.50  20.19 20.295 20.675  25.00  28.85 30.000 33.00
## Q3       20.00   25.18   25.00  28.69 28.850 27.300  34.59  38.41 38.460 45.64
## UpFence  35.00   44.41   40.00  49.04 48.210 44.230  57.69  63.57 64.100 79.47
wage.occu.09male$n # Sample size: observations per box, in the column order of the stats matrix
##  [1] 5883 1315 2025 2434  376 8232  525 2632 1479 2041
table(wage.occu.09male$group) # Total Outliers: number of outlying points per box (index = box position)
## 
##   1   2   3   4   5   6   7   8   9  10 
## 355  41  46  60   5 108  14  60  34  18
kruskal.test(HRLYEARN ~ NOC_10, data = data.all.09male) # Diff. among groups; NOTE(review): grouped by NOC_10, plot uses NOC_10short - confirm they code the same 10 groups
## 
##  Kruskal-Wallis rank sum test
## 
## data:  HRLYEARN by NOC_10
## Kruskal-Wallis chi-squared = 5740.1, df = 9, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NOC_10, data = data.all.09male, method = "bonferroni") # Post Hoc
# Males 2019
# Rank the occupation levels by their median hourly wage so the boxes are
# drawn in that order.
occu.order.male19 <- with(
  data.all.19male,
  reorder(NOC_10short, HRLYEARN, median)
)

# Deep bottom margin: the occupation labels are drawn perpendicular (las = 2).
par(mar = c(16, 5, 2, 1))
wage.occu.19male <- boxplot(
  HRLYEARN ~ occu.order.male19,
  data = data.all.19male,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "royalblue",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males 2019 - Hourly Wage by Occupation"
)
mtext("Occupation (NOC_10)", side = 1, line = 4.3)

# Name the 5-row summary matrix (rows = statistics, columns = occupations)
# and print it.
dimnames(wage.occu.19male$stats) <- list(
  c("LowFence", "Q1", "Median", "Q3", "UpFence"),
  levels(occu.order.male19)
)
wage.occu.19male$stats
##          Sales NatAgri ManUtil ArtCul Trades BusFin Health NatASc EduLaw  Mngt
## LowFence  3.04    3.21    6.92   9.85  3.460   4.62  10.00   5.13   3.00  3.30
## Q1       14.00   17.00   18.00  17.50 20.000  20.19  21.00  26.62  26.25 32.88
## Median   16.50   23.00   23.36  23.50 26.000  27.00  30.00  36.06  37.50 45.00
## Q3       23.00   33.00   32.00  31.25 34.625  37.50  40.49  46.63  48.08 57.69
## UpFence  36.13   57.00   53.00  51.00 56.500  63.46  68.68  76.51  79.91 94.87
wage.occu.19male$n # Sample size: observations per box, in the column order of the stats matrix
##  [1] 5441 1243 1940  368 7492 2127  622 2751 1802 1713
table(wage.occu.19male$group) # Total Outliers: number of outlying points per box (index = box position)
## 
##   1   2   3   4   5   6   7   8   9  10 
## 266  51  59   8  95  57  10  61  32  31
kruskal.test(HRLYEARN ~ NOC_10, data = data.all.19male) # Diff. among groups; NOTE(review): grouped by NOC_10, plot uses NOC_10short - confirm they code the same 10 groups
## 
##  Kruskal-Wallis rank sum test
## 
## data:  HRLYEARN by NOC_10
## Kruskal-Wallis chi-squared = 7220.2, df = 9, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NOC_10, data = data.all.19male, method = "bonferroni") # Post Hoc
# Females 2009
# Rank the occupation levels by their median hourly wage so the boxes are
# drawn in that order.
occu.order.fem09 <- with(
  data.all.09fem,
  reorder(NOC_10short, HRLYEARN, median)
)

# Deep bottom margin: the occupation labels are drawn perpendicular (las = 2).
par(mar = c(16, 5, 2, 1))
wage.occu.09fem <- boxplot(
  HRLYEARN ~ occu.order.fem09,
  data = data.all.09fem,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "moccasin",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females 2009 - Hourly Wage by Occupation"
)
mtext("Occupation (NOC_10)", side = 1, line = 4.3)

# Name the 5-row summary matrix (rows = statistics, columns = occupations)
# and print it.
dimnames(wage.occu.09fem$stats) <- list(
  c("LoFence", "Q1", "Median", "Q3", "UpFence"),
  levels(occu.order.fem09)
)
wage.occu.09fem$stats
##          Sales NatAgri ManUtil Trades ArtCul BusFin Health NatASc EduLaw   Mngt
## LoFence  2.000    4.81   3.500   4.63   3.55   3.45   4.17  5.000  3.070  4.160
## Q1       9.500   10.00  10.970  11.50  13.00  14.35  17.55 19.220 18.495 18.000
## Median  11.000   12.00  13.685  15.00  18.00  18.50  23.90 25.640 26.000 26.555
## Q3      15.365   16.50  17.150  20.00  26.39  23.08  33.65 33.685 33.750 38.460
## UpFence 24.130   25.96  26.000  32.09  46.15  36.06  57.69 55.000 56.410 68.680
wage.occu.09fem$n # Sample size: observations per box, in the column order of the stats matrix
##  [1] 8695  266  800  534  576 7388 3398  748 3664 1546
table(wage.occu.09fem$group) # Total Outliers: number of outlying points per box (index = box position)
## 
##   1   2   3   4   5   6   7   8   9  10 
## 330  14  37  14  12 244  14  20  50  22
kruskal.test(HRLYEARN ~ NOC_10, data = data.all.09fem) # Diff. among groups; NOTE(review): grouped by NOC_10, plot uses NOC_10short - confirm they code the same 10 groups
## 
##  Kruskal-Wallis rank sum test
## 
## data:  HRLYEARN by NOC_10
## Kruskal-Wallis chi-squared = 9838.5, df = 9, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NOC_10, data = data.all.09fem, method = "bonferroni") # Post Hoc
# Females 2019
# Rank the occupation levels by their median hourly wage so the boxes are
# drawn in that order.
occu.order.fem19 <- with(
  data.all.19fem,
  reorder(NOC_10short, HRLYEARN, median)
)

# Deep bottom margin: the occupation labels are drawn perpendicular (las = 2).
par(mar = c(16, 5, 2, 1))
wage.occu.19fem <- boxplot(
  HRLYEARN ~ occu.order.fem19,
  data = data.all.19fem,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "darkorange",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females 2019 - Hourly Wage by Occupation"
)
mtext("Occupation (NOC_10)", side = 1, line = 4.3)

# Name the 5-row summary matrix (rows = statistics, columns = occupations)
# and print it.
dimnames(wage.occu.19fem$stats) <- list(
  c("LoFence", "Q1", "Median", "Q3", "UpFence"),
  levels(occu.order.fem19)
)
wage.occu.19fem$stats
##         Sales NatAgri ManUtil Trades ArtCul BusFin Health EduLaw NatASc  Mngt
## LoFence  5.00    5.26   10.50   4.55   5.42   3.07   5.05   3.48   6.25  3.30
## Q1      13.75   14.50   15.00  16.54  16.00  19.49  21.00  20.00  24.52 26.92
## Median  15.00   17.00   17.50  20.00  21.00  24.34  27.79  27.88  33.64 38.46
## Q3      19.79   23.97   21.45  26.00  27.89  30.53  40.00  39.90  43.00 52.88
## UpFence 28.72   38.00   31.00  40.15  45.64  47.00  67.31  69.71  70.62 91.35
wage.occu.19fem$n # Sample size: observations per box, in the column order of the stats matrix
##  [1] 7341  329  649  563  493 5958 3455 4569  832 1243
table(wage.occu.19fem$group) # Total Outliers: number of outlying points per box (index = box position)
## 
##   1   2   3   4   5   6   7   8   9  10 
## 441  27  56  29   9 251  12  71  15  10
kruskal.test(HRLYEARN ~ NOC_10, data = data.all.19fem) # Diff. among groups; NOTE(review): grouped by NOC_10, plot uses NOC_10short - confirm they code the same 10 groups
## 
##  Kruskal-Wallis rank sum test
## 
## data:  HRLYEARN by NOC_10
## Kruskal-Wallis chi-squared = 8336.7, df = 9, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NOC_10, data = data.all.19fem, method = "bonferroni") # Post Hoc
# 2009 v. 2019 ################################################################
# SECTOR ######################################################################
# Mann Whitney U Test
# Two-sided rank-sum test of HRLYEARN between the two COWMAIN sectors
# (data.all.09male), with a confidence interval for the location shift.
# `alternative` and `TRUE` are spelled out: no partial matching, no reliance on T.
wilcox.test(
  HRLYEARN ~ COWMAIN,
  data = data.all.09male,
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by COWMAIN
## W = 78664610, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  6.700064 7.270075
## sample estimates:
## difference in location 
##               6.999958
# Two-sided rank-sum test of HRLYEARN between the two COWMAIN sectors
# (data.all.19male), with a confidence interval for the location shift.
# `alternative` and `TRUE` are spelled out: no partial matching, no reliance on T.
wilcox.test(
  HRLYEARN ~ COWMAIN,
  data = data.all.19male,
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by COWMAIN
## W = 69385905, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  8.039925 8.999984
## sample estimates:
## difference in location 
##               8.500087
# Two-sided rank-sum test of HRLYEARN between the two COWMAIN sectors
# (data.all.09fem), with a confidence interval for the location shift.
# `alternative` and `TRUE` are spelled out: no partial matching, no reliance on T.
wilcox.test(
  HRLYEARN ~ COWMAIN,
  data = data.all.09fem,
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by COWMAIN
## W = 133145579, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  8.959938 9.269923
## sample estimates:
## difference in location 
##               9.039943
# Two-sided rank-sum test of HRLYEARN between the two COWMAIN sectors
# (data.all.19fem), with a confidence interval for the location shift.
# `alternative` and `TRUE` are spelled out: no partial matching, no reliance on T.
wilcox.test(
  HRLYEARN ~ COWMAIN,
  data = data.all.19fem,
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by COWMAIN
## W = 112929303, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##   9.710015 10.109941
## sample estimates:
## difference in location 
##               9.999988
# MALES
# Boxplot Analysis
# Single panel with a deep bottom margin.
par(mfrow = c(1, 1), mar = c(16, 5, 2, 1))
summary(data.all$HRLYEARN[data.all$SEX == "Male"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.14   16.00   23.00   26.19   33.00  115.38
# Rank the sectors by median male wage; boxes are then drawn as
# (2009, 2019) pairs, one pair per sector.
sec.order.male <- with(data.all[data.all$SEX == "Male", ],
                       reorder(COWMAIN, HRLYEARN, median))
wage.sec.male <- boxplot(HRLYEARN ~ SURVYEAR * sec.order.male,
                         data = data.all[data.all$SEX == "Male", ],
                         boxwex = 0.4,
                         ylim = c(0, 120),
                         las = 1,
                         cex.axis = 1.0,
                         col = c("slategray2", "royalblue"),
                         xaxt = "n",
                         xlab = "",
                         ylab = "Hourly Wages (HRLYEARN)",
                         main = "Males Hourly Wage by Year and Sector")
# Label of X Axis: one label per sector, centred under its pair of boxes.
# Positions are derived from the number of levels so `at` and `labels`
# always have the same length.
axis(1,
     at = seq(1.5, by = 2, length.out = nlevels(sec.order.male)),
     labels = levels(sec.order.male),
     tick = FALSE,
     cex = 0.3,
     las = 1)
# Grey Vertical Lines: separators before, between and after the sector pairs
# (abline() is vectorised over `v`, so no loop is needed).
abline(v = seq(0.5, by = 2, length.out = nlevels(sec.order.male) + 1L),
       lty = 1, col = "grey")
# Add a legend
legend("topleft", legend = c("Males 2009", "Males 2019"),
       col = c("slategray2", "royalblue"),
       pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
       inset = c(0.01, 0.01))

# Boxplot Stats: label rows (statistics) and columns (year.sector boxes)
rownames(wage.sec.male$stats) <- c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence")
colnames(wage.sec.male$stats) <- wage.sec.male$names
wage.sec.male$stats # Boxplot Summary
##             2009.Private sector 2019.Private sector 2009.Public sector
## Lower Fence               2.140                3.00               3.13
## Q1                       13.615               17.31              20.00
## Median                   19.290               24.04              27.00
## Q3                       27.640               35.00              36.00
## Upper Fence              48.580               61.49              60.00
##             2019.Public sector
## Lower Fence               3.25
## Q1                       25.00
## Median                   34.00
## Q3                       45.00
## Upper Fence              75.00
wage.sec.male$n # Sample size: observations per box, in the column order of the stats matrix
## [1] 21615 20581  5327  4918
table(wage.sec.male$group) # Total Outliers: number of outlying points per box (index = box position)
## 
##   1   2   3   4 
## 653 662 105  67
# FEMALES
summary(data.all$HRLYEARN[data.all$SEX == "Female"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   14.00   19.72   22.58   28.05  106.67
# Rank the sectors by median female wage; boxes are then drawn as
# (2009, 2019) pairs, one pair per sector.
sec.order.fem <- with(data.all[data.all$SEX == "Female", ],
                      reorder(COWMAIN, HRLYEARN, median))
# The 2019 fill is "darkorange" so the boxes match the legend swatch below
# (the fill used to be "darkorange2", which the legend did not show).
wage.sec.fem <- boxplot(HRLYEARN ~ SURVYEAR * sec.order.fem,
                        data = data.all[data.all$SEX == "Female", ],
                        boxwex = 0.4,
                        ylim = c(0, 120),
                        las = 1,
                        cex.axis = 1.0,
                        col = c("moccasin", "darkorange"),
                        xaxt = "n",
                        xlab = "",
                        ylab = "Hourly Wages (HRLYEARN)",
                        main = "Females Hourly Wage by Year and Sector")
# Label of X Axis: one label per sector, centred under its pair of boxes.
# Positions are derived from the number of levels so `at` and `labels`
# always have the same length.
axis(1,
     at = seq(1.5, by = 2, length.out = nlevels(sec.order.fem)),
     labels = levels(sec.order.fem),
     tick = FALSE,
     cex = 0.3,
     las = 1)
# Grey Vertical Lines: separators before, between and after the sector pairs
# (abline() is vectorised over `v`, so no loop is needed).
abline(v = seq(0.5, by = 2, length.out = nlevels(sec.order.fem) + 1L),
       lty = 1, col = "grey")
# Add a legend
legend("topleft", legend = c("Females 2009", "Females 2019"),
       col = c("moccasin", "darkorange"),
       pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
       inset = c(0.01, 0.01))

# Boxplot Stats: label rows (statistics) and columns (year.sector boxes)
rownames(wage.sec.fem$stats) <- c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence")
colnames(wage.sec.fem$stats) <- wage.sec.fem$names
wage.sec.fem$stats # Boxplot Summary
##             2009.Private sector 2019.Private sector 2009.Public sector
## Lower Fence                2.00                3.13              3.070
## Q1                        10.00               15.00             18.500
## Median                    14.07               18.75             24.000
## Q3                        20.00               25.82             32.695
## Upper Fence               35.00               42.05             53.850
##             2019.Public sector
## Lower Fence               3.07
## Q1                       23.00
## Median                   30.00
## Q3                       40.88
## Upper Fence              67.31
wage.sec.fem$n # Sample size: observations per box, in the column order of the stats matrix
## [1] 18508 16650  9107  8782
table(wage.sec.fem$group) # Total Outliers: number of outlying points per box (index = box position)
## 
##   1   2   3   4 
## 839 979 121 117
# 2009 v. 2019 ################################################################
# INDUSTRY ####################################################################
# MALES
# Mann Whitney U Test
# Two-sided rank-sum test of HRLYEARN between survey years, males with
# NAICS_18short == "Utils"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" & data.all$NAICS_18short == "Utils", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top1
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 44871, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -14.07003 -10.89997
## sample estimates:
## difference in location 
##              -12.52998
# Two-sided rank-sum test of HRLYEARN between survey years, males with
# NAICS_18short == "PubAd"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" & data.all$NAICS_18short == "PubAd", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top2
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 1116584, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -8.049964 -6.410015
## sample estimates:
## difference in location 
##              -7.230063
# Two-sided rank-sum test of HRLYEARN between survey years, males with
# NAICS_18short == "Fores"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" & data.all$NAICS_18short == "Fores", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top3
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 502548, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -10.849993  -8.589955
## sample estimates:
## difference in location 
##              -9.739952
# Two-sided rank-sum test of HRLYEARN between survey years, males with
# NAICS_18short == "Educa"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" & data.all$NAICS_18short == "Educa", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top4
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 677554, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -8.579975 -6.310040
## sample estimates:
## difference in location 
##              -7.450054
# Two-sided rank-sum test of HRLYEARN between survey years, males with
# NAICS_18short == "ProSc"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" & data.all$NAICS_18short == "ProSc", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top5
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 554060, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -6.899933 -4.619922
## sample estimates:
## difference in location 
##              -5.719948
# Boxplot Analysis
# Single panel with a deep bottom margin for the perpendicular industry labels.
par(mfrow = c(1, 1), mar = c(16, 5, 2, 1))
summary(data.all$HRLYEARN[data.all$SEX == "Male"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.14   16.00   23.00   26.19   33.00  115.38
# Rank the industries by median male wage; boxes are then drawn as
# (2009, 2019) pairs, one pair per industry.
ind.order.male <- with(data.all[data.all$SEX == "Male", ],
                       reorder(NAICS_18short, HRLYEARN, median))
wage.ind.male <- boxplot(HRLYEARN ~ SURVYEAR * ind.order.male,
                         data = data.all[data.all$SEX == "Male", ],
                         boxwex = 0.4,
                         ylim = c(0, 120),
                         cex.axis = 1.0,
                         col = c("slategray2", "royalblue"),
                         xaxt = "n",
                         xlab = "",
                         ylab = "Hourly Wages (HRLYEARN)",
                         main = "Males Hourly Wage by Year and Industry")
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# Label of X Axis: one label per industry, centred under its pair of boxes.
# Positions are derived from the number of levels so `at` and `labels`
# always have the same length.
axis(1,
     at = seq(1.5, by = 2, length.out = nlevels(ind.order.male)),
     labels = levels(ind.order.male),
     tick = FALSE,
     cex = 0.3,
     las = 2)
# Grey Vertical Lines: separators before, between and after the industry
# pairs (abline() is vectorised over `v`; the old sequence ran past the
# last box).
abline(v = seq(0.5, by = 2, length.out = nlevels(ind.order.male) + 1L),
       lty = 1, col = "grey")
# Add a legend
legend("topleft", legend = c("Males 2009", "Males 2019"),
       col = c("slategray2", "royalblue"),
       pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
       inset = c(0.01, 0.01))

# Boxplot Stats: label rows (statistics) and columns (year.industry boxes)
rownames(wage.ind.male$stats) <- c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence")
colnames(wage.ind.male$stats) <- wage.ind.male$names
wage.ind.male$stats # Boxplot Summary
##             2009.AcFood 2019.AcFood 2009.Rtail 2019.Rtail 2009.Agri 2019.Agri
## Lower Fence        5.00        6.92       3.79      3.040      2.14      3.30
## Q1                 9.35       13.15      10.00     14.000     10.00     15.00
## Median            11.00       15.00      14.00     16.750     13.55     18.47
## Q3                15.00       17.61      20.14     24.855     17.31     23.08
## Upper Fence       23.08       24.18      35.20     41.080     28.00     35.00
##             2009.Mngt 2019.Mngt 2009.Other 2019.Other 2009.Info 2019.Info
## Lower Fence      3.33      4.56       3.13      3.000     3.380      4.62
## Q1              11.00     15.00      13.70     18.000    13.000     16.00
## Median          14.00     18.50      18.90     23.080    19.815     23.05
## Q3              20.00     25.00      25.00     30.965    28.850     34.07
## Upper Fence     33.00     40.00      41.83     50.000    51.920     60.22
##             2009.ManuN 2019.ManuN 2009.Trans 2019.Trans 2009.Whole 2019.Whole
## Lower Fence       5.45       3.53      3.210       5.00       4.25       9.62
## Q1               15.00      18.50     16.000      19.35      15.00      20.00
## Median           20.00      25.00     20.510      25.00      20.00      26.00
## Q3               28.00      34.00     25.985      32.00      27.47      36.00
## Upper Fence      47.16      56.54     40.870      50.96      46.15      58.50
##             2009.ManuD 2019.ManuD 2009.Health 2019.Health 2009.Const 2019.Const
## Lower Fence       3.48       5.29       3.300        4.81       5.10      8.170
## Q1               17.00      20.00      16.465       20.00      17.00     22.000
## Median           22.00      26.00      21.450       26.25      22.00     28.745
## Q3               29.51      35.00      31.000       37.90      29.87     37.000
## Upper Fence      48.08      57.50      52.200       63.37      49.15     58.000
##             2009.Finan 2019.Finan 2009.Fores 2019.Fores 2009.ProSc 2019.ProSc
## Lower Fence       3.75       3.53       4.44       3.21       5.00       5.77
## Q1               15.34      21.00      19.78      27.78      20.19      25.00
## Median           22.00      29.77      28.00      37.00      28.29      35.00
## Q3               33.65      42.31      34.62      49.04      38.74      46.67
## Upper Fence      60.00      72.92      55.77      80.77      65.93      78.85
##             2009.Educa 2019.Educa 2009.PubAd 2019.PubAd 2009.Utils 2019.Utils
## Lower Fence      3.610       3.25       3.13       5.13       5.13     14.000
## Q1              19.700      24.22      22.03      28.00      23.50     35.000
## Median          28.280      36.06      30.00      37.00      30.64     43.475
## Q3              37.555      48.08      37.91      47.00      38.00     52.000
## Upper Fence     64.100      82.05      61.54      75.00      58.24     76.920
wage.ind.male$n # Sample size: observations per box, in the column order of the stats matrix
##  [1] 1429 1336 2994 2792  497  457  992  955  929  819 1106  981 1776 1489 1860
## [16] 1872 1258 1108 2605 2207 1067 1205 3209 3264  971 1014 1330 1242 1086 1326
## [31] 1460 1306 1880 1746  493  380
table(wage.ind.male$group) # Total Outliers: number of outlying points per box (index = box position)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##  80 118 136 155  14  33  64  53  33  36  23  21  60  61  96  71  49  40  76  66 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##  29  29  41  60  30  39  49  17  38  37  19  19  39  18  10   7
# FEMALES
# Mann Whitney U Test
# Two-sided rank-sum test of HRLYEARN between survey years, females with
# NAICS_18short == "Utils"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" & data.all$NAICS_18short == "Utils", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top1
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 3277.5, p-value = 3.859e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -14.999966  -9.439992
## sample estimates:
## difference in location 
##                 -12.13
# Two-sided rank-sum test of HRLYEARN between survey years, females with
# NAICS_18short == "Fores"; conf.int adds the location-shift estimate and CI.
# Rank tag corrected: by 2019 female median wage Fores (34.00) is second,
# after Utils (37.00) and ahead of PubAd (31.37).
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" & data.all$NAICS_18short == "Fores", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top2
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 18717, p-value = 2.28e-10
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -10.649967  -5.650075
## sample estimates:
## difference in location 
##              -8.030013
# Two-sided rank-sum test of HRLYEARN between survey years, females with
# NAICS_18short == "PubAd"; conf.int adds the location-shift estimate and CI.
# Rank tag corrected: by 2019 female median wage PubAd (31.37) is third,
# after Utils (37.00) and Fores (34.00).
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" & data.all$NAICS_18short == "PubAd", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top3
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 1070692, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -7.280006 -5.999961
## sample estimates:
## difference in location 
##              -6.669972
# Two-sided rank-sum test of HRLYEARN between survey years, females with
# NAICS_18short == "Educa"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" & data.all$NAICS_18short == "Educa", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top4
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 3546959, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -6.269986 -5.099947
## sample estimates:
## difference in location 
##              -5.749971
# Two-sided rank-sum test of HRLYEARN between survey years, females with
# NAICS_18short == "ProSc"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" & data.all$NAICS_18short == "ProSc", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top5
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 404999, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -6.970001 -5.270070
## sample estimates:
## difference in location 
##              -6.040043
# Boxplot Analysis
# Single panel with a deep bottom margin for the perpendicular industry labels.
par(mfrow = c(1, 1), mar = c(16, 5, 2, 1))
summary(data.all$HRLYEARN[data.all$SEX == "Female"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   14.00   19.72   22.58   28.05  106.67
# Rank the industries by median female wage; boxes are then drawn as
# (2009, 2019) pairs, one pair per industry.
ind.order.fem <- with(data.all[data.all$SEX == "Female", ],
                      reorder(NAICS_18short, HRLYEARN, median))
wage.ind.fem <- boxplot(HRLYEARN ~ SURVYEAR * ind.order.fem,
                        data = data.all[data.all$SEX == "Female", ],
                        boxwex = 0.4,
                        ylim = c(0, 120),
                        cex.axis = 1.0,
                        col = c("moccasin", "darkorange"),
                        xaxt = "n",
                        xlab = "",
                        ylab = "Hourly Wages (HRLYEARN)",
                        main = "Females Hourly Wage by Year and Industry")
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# Label of X Axis: one label per industry, centred under its pair of boxes.
# Positions are derived from the number of levels so `at` and `labels`
# always have the same length.
axis(1,
     at = seq(1.5, by = 2, length.out = nlevels(ind.order.fem)),
     labels = levels(ind.order.fem),
     tick = FALSE,
     cex = 0.3,
     las = 2)
# Grey Vertical Lines: separators before, between and after the industry
# pairs (abline() is vectorised over `v`; the old sequence ran past the
# last box).
abline(v = seq(0.5, by = 2, length.out = nlevels(ind.order.fem) + 1L),
       lty = 1, col = "grey")
# Add a legend
legend("topleft", legend = c("Females 2009", "Females 2019"),
       col = c("moccasin", "darkorange"),
       pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
       inset = c(0.01, 0.01))

# Boxplot Stats: label rows (statistics) and columns (year.industry boxes)
rownames(wage.ind.fem$stats) <- c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence")
colnames(wage.ind.fem$stats) <- wage.ind.fem$names
wage.ind.fem$stats # Boxplot Summary
##             2009.AcFood 2019.AcFood 2009.Rtail 2019.Rtail 2009.Agri 2019.Agri
## Lower Fence        4.77        7.41       3.66       8.25      4.81     5.770
## Q1                 9.00       13.00       9.50      13.50      9.70    14.000
## Median            10.00       14.50      10.95      15.00     11.00    16.000
## Q3                13.00       16.86      14.50      19.00     14.00    20.875
## Upper Fence       19.00       22.50      22.00      27.25     20.00    30.000
##             2009.Mngt 2019.Mngt 2009.Other 2019.Other 2009.ManuN 2019.ManuN
## Lower Fence      5.03      5.26       2.00       4.81       5.77      10.50
## Q1              10.30     14.77      10.53      15.00      11.50      15.34
## Median          13.00     18.00      15.00      19.17      15.00      19.75
## Q3              17.00     23.67      19.56      27.40      19.35      26.00
## Upper Fence     27.00     36.54      32.97      45.05      31.00      41.03
##             2009.Info 2019.Info 2009.Whole 2019.Whole 2009.Const 2019.Const
## Lower Fence      3.55      5.00       5.13      10.30       3.50       6.07
## Q1              11.00     15.00      13.00      17.50      13.85      18.00
## Median          16.00     20.00      16.83      22.09      16.92      23.00
## Q3              23.00     28.35      21.63      28.85      20.77      28.90
## Upper Fence     40.87     48.21      34.00      45.67      30.29      45.00
##             2009.ManuD 2019.ManuD 2009.Trans 2019.Trans 2009.Finan 2019.Finan
## Lower Fence      5.490       6.92       3.55      3.070       5.22      4.360
## Q1              14.420      18.03      13.50     17.465      15.00     19.975
## Median          18.000      22.50      18.00     21.720      19.23     25.640
## Q3              24.105      29.81      23.12     27.295      25.00     34.055
## Upper Fence     38.460      46.77      37.33     42.000      40.00     54.950
##             2009.Health 2019.Health 2009.ProSc 2019.ProSc 2009.Educa 2019.Educa
## Lower Fence        3.50        3.50       3.08       3.13       3.07       3.48
## Q1                15.87       19.00      15.00      20.00      18.00      23.00
## Median            20.00       24.04      20.00      26.44      24.62      30.00
## Q3                29.07       35.00      28.21      35.49      33.65      42.31
## Upper Fence       48.73       59.00      47.99      58.00      57.05      71.15
##             2009.PubAd 2019.PubAd 2009.Utils 2019.Utils 2009.Fores 2019.Fores
## Lower Fence       5.49      4.810       3.90      16.07       5.00      4.730
## Q1               20.00     25.295      19.00      27.85      18.46     23.875
## Median           25.00     31.370      24.16      37.00      25.25     34.000
## Q3               31.87     41.325      29.78      46.15      33.65     45.095
## Upper Fence      49.60     65.000      44.00      72.82      55.27     72.120
wage.ind.fem$n # Sample size: observations per box, in the column order of the stats matrix
##  [1] 2518 2204 4155 3333  221  252  821  666 1127  897 1007  819 1079  877  511
## [16]  482  381  482  640  529  677  687 1823 1492 6213 6227 1074 1168 3114 3159
## [31] 1878 1804  140  115  236  239
table(wage.ind.fem$group) # Total Outliers: number of outlying points per box (index = box position)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 178 184 301 272  14  17  54  26  68  35  53  57  30  32  15  23  25  26  31  24 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##  17  44  98  72  62  60  37  40  47  43  59  46   5   2   9   1
# 2009 v. 2019 ################################################################
# OCCUPATION ##################################################################
# MALES
# Mann Whitney U Test
# Two-sided rank-sum test of HRLYEARN between survey years, males with
# NOC_10short == "Mngt"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" & data.all$NOC_10short == "Mngt", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top1
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 1120265, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -12.25999 -10.00005
## sample estimates:
## difference in location 
##              -11.17997
# Two-sided rank-sum test of HRLYEARN between survey years, males with
# NOC_10short == "EduLaw"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" & data.all$NOC_10short == "EduLaw", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top2
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 966244, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -8.059976 -6.049982
## sample estimates:
## difference in location 
##               -7.05005
# Two-sided rank-sum test of HRLYEARN between survey years, males with
# NOC_10short == "NatASc"; conf.int adds the location-shift estimate and CI.
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" & data.all$NOC_10short == "NatASc", ],
  alternative = "two.sided",
  conf.int = TRUE
) # Top3
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 2564647, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -7.489955 -6.020037
## sample estimates:
## difference in location 
##              -6.770048
# Boxplot Analysis
# Rank the occupations by median male wage; boxes are then drawn as
# (2009, 2019) pairs, one pair per occupation.
occu.order.male <- with(data.all[data.all$SEX == "Male", ],
                        reorder(NOC_10short, HRLYEARN, median))
par(mar = c(16, 5, 2, 1))
# ylim = c(0, 120) added: every other wage boxplot in this analysis uses it,
# which keeps the figures on one scale and leaves room for the legend.
wage.occu.male <- boxplot(HRLYEARN ~ SURVYEAR * occu.order.male,
                          data = data.all[data.all$SEX == "Male", ],
                          boxwex = 0.4,
                          ylim = c(0, 120),
                          las = 2,
                          cex.axis = 1.0,
                          col = c("slategray2", "royalblue"),
                          xaxt = "n",
                          xlab = "",
                          ylab = "Hourly Wages (HRLYEARN)",
                          main = "Males Hourly Wage by Year and Occupation")
mtext("Occupation (NOC_10)", side = 1, line = 4.3)
# Label of X Axis: one label per occupation, centred under its pair of boxes.
# Positions are derived from the number of levels so `at` and `labels`
# always have the same length.
axis(1,
     at = seq(1.5, by = 2, length.out = nlevels(occu.order.male)),
     labels = levels(occu.order.male),
     tick = FALSE,
     cex = 0.3,
     las = 2)
# Grey Vertical Lines: separators before, between and after the occupation
# pairs. The old seq(0.5, 20, 2) stopped at 18.5 and left the last pair
# without its closing line; abline() is vectorised over `v`.
abline(v = seq(0.5, by = 2, length.out = nlevels(occu.order.male) + 1L),
       lty = 1, col = "grey")
# Add a legend
legend("topleft", legend = c("Males 2009", "Males 2019"),
       col = c("slategray2", "royalblue"),
       pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
       inset = c(0.01, 0.01))

# Raw summary matrix, printed before row/column names are attached
wage.occu.male$stats
##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]   [,7]  [,8]   [,9]  [,10] [,11] [,12]
## [1,]  3.13  3.04  2.14  3.21  6.67  6.92  4.210  9.85  3.210  3.460  3.13  4.62
## [2,] 10.00 14.00 12.00 17.00 15.00 18.00 15.000 17.50 16.000 20.000 15.00 20.19
## [3,] 14.00 16.50 17.00 23.00 19.50 23.36 20.295 23.50 20.675 26.000 20.19 27.00
## [4,] 20.00 23.00 25.18 33.00 25.00 32.00 28.850 31.25 27.300 34.625 28.69 37.50
## [5,] 35.00 36.13 44.41 57.00 40.00 53.00 48.210 51.00 44.230 56.500 49.04 63.46
##      [,13] [,14] [,15] [,16]  [,17] [,18] [,19] [,20]
## [1,]  7.00 10.00  3.79  5.13  3.610  3.00  3.48  3.30
## [2,] 18.50 21.00 21.63 26.62 20.975 26.25 23.08 32.88
## [3,] 25.00 30.00 28.85 36.06 30.000 37.50 33.00 45.00
## [4,] 34.59 40.49 38.41 46.63 38.460 48.08 45.64 57.69
## [5,] 57.69 68.68 63.57 76.51 64.100 79.91 79.47 94.87
# Boxplot Stats
# Attach row (statistic) and column (year.occupation box) names in one step,
# then print the labelled matrix.
dimnames(wage.occu.male$stats) <- list(
  c("LowFence", "Q1", "Median", "Q3", "UpFence"),
  wage.occu.male$names
)
wage.occu.male$stats # Boxplot Summary
##          2009.Sales 2019.Sales 2009.NatAgri 2019.NatAgri 2009.ManUtil
## LowFence       3.13       3.04         2.14         3.21         6.67
## Q1            10.00      14.00        12.00        17.00        15.00
## Median        14.00      16.50        17.00        23.00        19.50
## Q3            20.00      23.00        25.18        33.00        25.00
## UpFence       35.00      36.13        44.41        57.00        40.00
##          2019.ManUtil 2009.ArtCul 2019.ArtCul 2009.Trades 2019.Trades
## LowFence         6.92       4.210        9.85       3.210       3.460
## Q1              18.00      15.000       17.50      16.000      20.000
## Median          23.36      20.295       23.50      20.675      26.000
## Q3              32.00      28.850       31.25      27.300      34.625
## UpFence         53.00      48.210       51.00      44.230      56.500
##          2009.BusFin 2019.BusFin 2009.Health 2019.Health 2009.NatASc
## LowFence        3.13        4.62        7.00       10.00        3.79
## Q1             15.00       20.19       18.50       21.00       21.63
## Median         20.19       27.00       25.00       30.00       28.85
## Q3             28.69       37.50       34.59       40.49       38.41
## UpFence        49.04       63.46       57.69       68.68       63.57
##          2019.NatASc 2009.EduLaw 2019.EduLaw 2009.Mngt 2019.Mngt
## LowFence        5.13       3.610        3.00      3.48      3.30
## Q1             26.62      20.975       26.25     23.08     32.88
## Median         36.06      30.000       37.50     33.00     45.00
## Q3             46.63      38.460       48.08     45.64     57.69
## UpFence        76.51      64.100       79.91     79.47     94.87
# Number of observations behind each box, in the same order as the columns of $stats
wage.occu.male$n # Sample size
##  [1] 5883 5441 1315 1243 2025 1940  376  368 8232 7492 2434 2127  525  622 2632
## [16] 2751 1479 1802 2041 1713
# $group records the box index of every outlier, so tabulating it counts outliers per box
table(wage.occu.male$group) # Total Outliers
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 355 266  41  51  46  59   5   8 108  95  60  57  14  10  60  61  34  32  18  31
# FEMALES
# Mann Whitney U Test
# Two-sided rank-sum test of hourly wage by survey year: females, occupation "Mngt" (Top1)
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" & data.all$NOC_10short == "Mngt", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 593358, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -12.019993  -9.619946
## sample estimates:
## difference in location 
##              -10.81004
# Two-sided rank-sum test of hourly wage by survey year: females, occupation "NatASc" (Top2)
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" & data.all$NOC_10short == "NatASc", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 208380, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -8.200015 -5.870033
## sample estimates:
## difference in location 
##              -7.000035
# Two-sided rank-sum test of hourly wage by survey year: females, occupation "EduLaw" (Top3)
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" & data.all$NOC_10short == "EduLaw", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SURVYEAR
## W = 7142727, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -3.560001 -2.509974
## sample estimates:
## difference in location 
##              -3.000021
# Boxplot Analysis
# Occupation factor for female rows, levels reordered by median hourly wage
occu.order.fem <- with(
  data.all[data.all$SEX == "Female", ],
  reorder(NOC_10short, HRLYEARN, FUN = median)
)
# Large bottom margin for the rotated occupation labels
par(mar = c(16, 5, 2, 1))
# One box per survey year within each occupation; default x axis suppressed
wage.occu.fem <- boxplot(
  HRLYEARN ~ SURVYEAR * occu.order.fem,
  data = data.all[data.all$SEX == "Female", ],
  boxwex = 0.4,
  las = 2,
  cex.axis = 1.0,
  col = c("moccasin", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females Hourly Wage by Year and Occupation"
)
# X-axis title
mtext("Occupation (NOC_10)", side = 1, line = 4.3)
# Occupation labels midway between each 2009/2019 pair of boxes
axis(
  side = 1,
  at = seq(1.5, 20, 2),
  labels = levels(occu.order.fem),
  tick = FALSE,
  cex = 0.3,
  las = 2
)
# Grey vertical lines separating the occupation groups (abline accepts a vector for v)
abline(v = seq(0.5, 20, 2), lty = 1, col = "grey")
# Legend: one filled square per survey year
legend(
  "topleft",
  legend = c("Females 2009", "Females 2019"),
  col = c("moccasin", "darkorange"),
  pch = 15,
  bty = "n",
  pt.cex = 2,
  cex = 1.0,
  horiz = TRUE,
  inset = c(0.01, 0.01)
)

# Raw boxplot statistics matrix (one column per box)
wage.occu.fem$stats
##        [,1]  [,2]  [,3]  [,4]   [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12]
## [1,]  2.000  5.00  4.81  5.26  3.500 10.50  4.63  4.55  3.55  5.42  3.45  3.07
## [2,]  9.500 13.75 10.00 14.50 10.970 15.00 11.50 16.54 13.00 16.00 14.35 19.49
## [3,] 11.000 15.00 12.00 17.00 13.685 17.50 15.00 20.00 18.00 21.00 18.50 24.34
## [4,] 15.365 19.79 16.50 23.97 17.150 21.45 20.00 26.00 26.39 27.89 23.08 30.53
## [5,] 24.130 28.72 25.96 38.00 26.000 31.00 32.09 40.15 46.15 45.64 36.06 47.00
##      [,13] [,14]  [,15] [,16]  [,17] [,18]  [,19] [,20]
## [1,]  4.17  5.05  3.070  3.48  5.000  6.25  4.160  3.30
## [2,] 17.55 21.00 18.495 20.00 19.220 24.52 18.000 26.92
## [3,] 23.90 27.79 26.000 27.88 25.640 33.64 26.555 38.46
## [4,] 33.65 40.00 33.750 39.90 33.685 43.00 38.460 52.88
## [5,] 57.69 67.31 56.410 69.71 55.000 70.62 68.680 91.35
# Boxplot Stats
# Label the five statistic rows and name each column after its box
dimnames(wage.occu.fem$stats) <- list(
  c("LoFence", "Q1", "Median", "Q3", "UpFence"),
  wage.occu.fem$names
)
wage.occu.fem$stats # Boxplot Summary
##         2009.Sales 2019.Sales 2009.NatAgri 2019.NatAgri 2009.ManUtil
## LoFence      2.000       5.00         4.81         5.26        3.500
## Q1           9.500      13.75        10.00        14.50       10.970
## Median      11.000      15.00        12.00        17.00       13.685
## Q3          15.365      19.79        16.50        23.97       17.150
## UpFence     24.130      28.72        25.96        38.00       26.000
##         2019.ManUtil 2009.Trades 2019.Trades 2009.ArtCul 2019.ArtCul
## LoFence        10.50        4.63        4.55        3.55        5.42
## Q1             15.00       11.50       16.54       13.00       16.00
## Median         17.50       15.00       20.00       18.00       21.00
## Q3             21.45       20.00       26.00       26.39       27.89
## UpFence        31.00       32.09       40.15       46.15       45.64
##         2009.BusFin 2019.BusFin 2009.Health 2019.Health 2009.EduLaw 2019.EduLaw
## LoFence        3.45        3.07        4.17        5.05       3.070        3.48
## Q1            14.35       19.49       17.55       21.00      18.495       20.00
## Median        18.50       24.34       23.90       27.79      26.000       27.88
## Q3            23.08       30.53       33.65       40.00      33.750       39.90
## UpFence       36.06       47.00       57.69       67.31      56.410       69.71
##         2009.NatASc 2019.NatASc 2009.Mngt 2019.Mngt
## LoFence       5.000        6.25     4.160      3.30
## Q1           19.220       24.52    18.000     26.92
## Median       25.640       33.64    26.555     38.46
## Q3           33.685       43.00    38.460     52.88
## UpFence      55.000       70.62    68.680     91.35
# Number of observations behind each box, in the same order as the columns of $stats
wage.occu.fem$n # Sample size
##  [1] 8695 7341  266  329  800  649  534  563  576  493 7388 5958 3398 3455 3664
## [16] 4569  748  832 1546 1243
# $group records the box index of every outlier, so tabulating it counts outliers per box
table(wage.occu.fem$group) # Total Outliers
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 330 441  14  27  37  56  14  29  12   9 244 251  14  12  50  71  20  15  22  10
# Males v. Females #############################################################
# SECTOR #######################################################################
# Mann Whitney U Tests
# Two-sided rank-sum test of hourly wage by sex: 2019, "Private sector"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$COWMAIN == "Private sector", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 218257664, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  4.000026 4.499960
## sample estimates:
## difference in location 
##               4.220031
# Two-sided rank-sum test of hourly wage by sex: 2019, "Public sector"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$COWMAIN == "Public sector", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 24650974, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  2.729960 3.609973
## sample estimates:
## difference in location 
##               3.090046
# Boxplot Analysis
# Sector factor for 2019 rows, levels reordered by median hourly wage
sec.order.19 <- with(
  data.all[data.all$SURVYEAR == 2019, ],
  reorder(COWMAIN, HRLYEARN, FUN = median)
)
# Same margins as the other boxplots in this analysis
par(mar = c(16, 5, 2, 1))
# One box per sex within each sector; default x axis suppressed
wage.sec.19 <- boxplot(
  HRLYEARN ~ SEX * sec.order.19,
  data = data.all[data.all$SURVYEAR == 2019, ],
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = c("royalblue", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Hourly Wage by Sector - 2019 Gender Comparison"
)
# Sector labels midway between each male/female pair of boxes
axis(
  side = 1,
  at = seq(1.5, 4, 2),
  labels = levels(sec.order.19),
  tick = FALSE,
  cex = 0.3,
  las = 1
)
# Grey vertical lines at 0.5, 2.5 and 4.5 (abline accepts a vector for v)
abline(v = seq(0.5, 6, 2), lty = 1, col = "grey")
# Legend: one filled square per sex
legend(
  "topleft",
  legend = c("Males 2019", "Females 2019"),
  col = c("royalblue", "darkorange"),
  pch = 15,
  bty = "n",
  pt.cex = 2,
  cex = 1.0,
  horiz = TRUE,
  inset = c(0.01, 0.01)
)

# Raw boxplot statistics matrix (one column per box)
wage.sec.19$stats
##       [,1]  [,2]  [,3]  [,4]
## [1,]  3.00  3.13  3.25  3.07
## [2,] 17.31 15.00 25.00 23.00
## [3,] 24.04 18.75 34.00 30.00
## [4,] 35.00 25.82 45.00 40.88
## [5,] 61.49 42.05 75.00 67.31
# Males v. Females #############################################################
# INDUSTRY #####################################################################
# Mann Whitney U Tests
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Agri"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Agri", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 68024, p-value = 6.254e-05
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  0.9999976 2.5000029
## sample estimates:
## difference in location 
##               1.750036
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Fores"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Fores", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 166042, p-value = 0.003606
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  1.000054 5.290070
## sample estimates:
## difference in location 
##               3.100048
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Utils"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Utils", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 27312, p-value = 4.826e-05
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  3.089948 8.900052
## sample estimates:
## difference in location 
##               6.000049
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Const"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Const", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 1020349, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  3.999967 5.829932
## sample estimates:
## difference in location 
##               4.999997
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "ManuD"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ManuD", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 699118, p-value = 1.544e-12
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  2.100034 3.950029
## sample estimates:
## difference in location 
##               3.000056
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "ManuN"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ManuN", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 790826, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  3.550011 5.000051
## sample estimates:
## difference in location 
##                4.24996
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Whole"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Whole", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 327178, p-value = 8.746e-13
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  2.500003 4.500087
## sample estimates:
## difference in location 
##               3.499906
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Rtail"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Rtail", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 5449884, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  1.000037 1.500014
## sample estimates:
## difference in location 
##                1.25008
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Trans"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Trans", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 756647, p-value = 6.886e-12
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  1.929985 3.349944
## sample estimates:
## difference in location 
##               2.600061
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Finan"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Finan", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 879092, p-value = 5.238e-12
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  2.469944 4.499994
## sample estimates:
## difference in location 
##               3.479976
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "ProSc"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ProSc", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 1019833, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  6.169975 8.279945
## sample estimates:
## difference in location 
##               7.210066
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Mngt"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Mngt", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 346817, p-value = 0.001885
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  0.250000 1.499922
## sample estimates:
## difference in location 
##              0.9999967
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Educa"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Educa", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 2383919, p-value = 2.516e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  3.000094 4.999943
## sample estimates:
## difference in location 
##               4.000059
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Health"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Health", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 4094719, p-value = 4.884e-07
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  0.9999984 2.1500867
## sample estimates:
## difference in location 
##               1.599926
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Info"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Info", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 500721, p-value = 9.825e-10
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  1.770008 3.499980
## sample estimates:
## difference in location 
##               2.549954
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "AcFood"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "AcFood", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 1550054, p-value = 0.008176
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  4.414496e-05 4.999908e-01
## sample estimates:
## difference in location 
##                0.14998
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "Other"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Other", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 456494, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  2.819933 4.359998
## sample estimates:
## difference in location 
##               3.500032
# Two-sided rank-sum test of hourly wage by sex: 2019, industry "PubAd"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "PubAd", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 1873268, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  3.380016 5.049965
## sample estimates:
## difference in location 
##               4.219966
# Boxplot Analysis
# Industry factor for 2019 rows, levels reordered by median hourly wage
ind.order.19 <- with(
  data.all[data.all$SURVYEAR == 2019, ],
  reorder(NAICS_18short, HRLYEARN, FUN = median)
)
# Large bottom margin for the rotated industry labels
par(mar = c(16, 5, 2, 1))
# One box per sex within each industry; default x axis suppressed
wage.ind.19 <- boxplot(
  HRLYEARN ~ SEX * ind.order.19,
  data = data.all[data.all$SURVYEAR == 2019, ],
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = c("royalblue", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Hourly Wage by Industry - 2019 Gender Comparison"
)
# X-axis title
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# Industry labels midway between each male/female pair of boxes
axis(
  side = 1,
  at = seq(1.5, 36, 2),
  labels = levels(ind.order.19),
  tick = FALSE,
  cex = 0.3,
  las = 2
)
# Grey vertical lines at 0.5, 2.5, ..., 38.5 (abline accepts a vector for v)
abline(v = seq(0.5, 40, 2), lty = 1, col = "grey")
# Legend: one filled square per sex
legend(
  "topleft",
  legend = c("Males 2019", "Females 2019"),
  col = c("royalblue", "darkorange"),
  pch = 15,
  bty = "n",
  pt.cex = 2,
  cex = 1.0,
  horiz = TRUE,
  inset = c(0.01, 0.01)
)

# Raw boxplot statistics matrix (one column per box)
wage.ind.19$stats
##       [,1]  [,2]   [,3]  [,4]  [,5]   [,6]  [,7]  [,8]   [,9] [,10] [,11] [,12]
## [1,]  6.92  7.41  3.040  8.25  3.30  5.770  4.56  5.26  3.000  4.81  4.62  5.00
## [2,] 13.15 13.00 14.000 13.50 15.00 14.000 15.00 14.77 18.000 15.00 16.00 15.00
## [3,] 15.00 14.50 16.750 15.00 18.47 16.000 18.50 18.00 23.080 19.17 23.05 20.00
## [4,] 17.61 16.86 24.855 19.00 23.08 20.875 25.00 23.67 30.965 27.40 34.07 28.35
## [5,] 24.18 22.50 41.080 27.25 35.00 30.000 40.00 36.54 50.000 45.05 60.22 48.21
##      [,13] [,14] [,15]  [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23]  [,24]
## [1,]  3.53 10.50  5.00  3.070  4.81  3.50  5.29  6.92  9.62 10.30  3.53  4.360
## [2,] 18.50 15.34 19.35 17.465 20.00 19.00 20.00 18.03 20.00 17.50 21.00 19.975
## [3,] 25.00 19.75 25.00 21.720 26.25 24.04 26.00 22.50 26.00 22.09 29.77 25.640
## [4,] 34.00 26.00 32.00 27.295 37.90 35.00 35.00 29.81 36.00 28.85 42.31 34.055
## [5,] 56.54 41.03 50.96 42.000 63.37 59.00 57.50 46.77 58.50 45.67 72.92 54.950
##       [,25] [,26] [,27] [,28] [,29] [,30] [,31]  [,32] [,33]  [,34]  [,35]
## [1,]  8.170  6.07  5.77  3.13  3.25  3.48  5.13  4.810  3.21  4.730 14.000
## [2,] 22.000 18.00 25.00 20.00 24.22 23.00 28.00 25.295 27.78 23.875 35.000
## [3,] 28.745 23.00 35.00 26.44 36.06 30.00 37.00 31.370 37.00 34.000 43.475
## [4,] 37.000 28.90 46.67 35.49 48.08 42.31 47.00 41.325 49.04 45.095 52.000
## [5,] 58.000 45.00 78.85 58.00 82.05 71.15 75.00 65.000 80.77 72.120 76.920
##      [,36]
## [1,] 16.07
## [2,] 27.85
## [3,] 37.00
## [4,] 46.15
## [5,] 72.82
# Males v. Females #############################################################
# OCCUPATION ###################################################################
# Mann Whitney U Tests
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "Mngt"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Mngt", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 1268072, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  4.809938 7.350044
## sample estimates:
## difference in location 
##               6.080045
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "NatASc"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatASc", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 1290381, p-value = 2.365e-08
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  1.950024 4.000029
## sample estimates:
## difference in location 
##               2.999974
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "EduLaw"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "EduLaw", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 5310381, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  6.639996 8.200009
## sample estimates:
## difference in location 
##               7.430066
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "Health"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Health", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 1140716, p-value = 0.01428
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  0.1500462 2.1200176
## sample estimates:
## difference in location 
##               1.150062
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "Trades"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Trades", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 2803271, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  4.080045 5.749970
## sample estimates:
## difference in location 
##               4.999944
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "BusFin"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "BusFin", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 7312384, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  2.130060 3.169963
## sample estimates:
## difference in location 
##               2.709973
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "ArtCul"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ArtCul", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 104442, p-value = 0.0001424
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  1.000031 3.500068
## sample estimates:
## difference in location 
##               2.219955
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "ManUtil"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ManUtil", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 903084, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  4.500042 5.949957
## sample estimates:
## difference in location 
##               5.050023
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "NatAgri"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatAgri", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 269920, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  3.500010 5.710072
## sample estimates:
## difference in location 
##               4.619989
# Two-sided rank-sum test of hourly wage by sex: 2019, occupation "Sales"
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Sales", ],
  alternative = "two.sided",
  conf.int = TRUE
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HRLYEARN by SEX
## W = 22827094, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  0.9999802 1.1500083
## sample estimates:
## difference in location 
##               1.000046
#Boxplot Analysis
# Occupation factor for 2019 rows, levels reordered by median hourly wage
occu.order.19 <- with(
  data.all[data.all$SURVYEAR == 2019, ],
  reorder(NOC_10short, HRLYEARN, FUN = median)
)
# Large bottom margin for the rotated occupation labels
par(mar = c(16, 5, 2, 1))
# One box per sex within each occupation; default x axis suppressed
wage.occu.19 <- boxplot(
  HRLYEARN ~ SEX * occu.order.19,
  data = data.all[data.all$SURVYEAR == 2019, ],
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = c("royalblue", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Hourly Wage by Occupation - 2019 Gender Comparison"
)
# Occupation labels midway between each male/female pair of boxes
axis(
  side = 1,
  at = seq(1.5, 20, 2),
  labels = levels(occu.order.19),
  tick = FALSE,
  cex = 0.3,
  las = 2
)
# Grey vertical lines separating the occupation groups (abline accepts a vector for v)
abline(v = seq(0.5, 20, 2), lty = 1, col = "grey")
# Legend: one filled square per sex
legend(
  "topleft",
  legend = c("Males 2019", "Females 2019"),
  col = c("royalblue", "darkorange"),
  pch = 15,
  bty = "n",
  pt.cex = 2,
  cex = 1.0,
  horiz = TRUE,
  inset = c(0.01, 0.01)
)

# Raw boxplot statistics matrix (one column per box)
wage.occu.19$stats
##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10]  [,11] [,12]
## [1,]  3.04  5.00  3.21  5.26  6.92 10.50  9.85  5.42  4.62  3.07  3.460  4.55
## [2,] 14.00 13.75 17.00 14.50 18.00 15.00 17.50 16.00 20.19 19.49 20.000 16.54
## [3,] 16.50 15.00 23.00 17.00 23.36 17.50 23.50 21.00 27.00 24.34 26.000 20.00
## [4,] 23.00 19.79 33.00 23.97 32.00 21.45 31.25 27.89 37.50 30.53 34.625 26.00
## [5,] 36.13 28.72 57.00 38.00 53.00 31.00 51.00 45.64 63.46 47.00 56.500 40.15
##      [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 10.00  5.05  3.00  3.48  5.13  6.25  3.30  3.30
## [2,] 21.00 21.00 26.25 20.00 26.62 24.52 32.88 26.92
## [3,] 30.00 27.79 37.50 27.88 36.06 33.64 45.00 38.46
## [4,] 40.49 40.00 48.08 39.90 46.63 43.00 57.69 52.88
## [5,] 68.68 67.31 79.91 69.71 76.51 70.62 94.87 91.35

STEP 4: Multiple Regression Analysis

Preparation before regression analysis

  • Numeric variables normalization
# Normalize numeric variables
# Min-max scale a vector onto [0, 1]: (x - min) / (max - min).
#   x     : numeric vector to rescale.
#   na.rm : if TRUE (default) NAs are ignored when finding min/max and stay NA
#           in the result; if FALSE any NA makes the whole result NA.
# Returns a vector of the same length as x. A vector whose non-missing values
# are all equal is mapped to 0 instead of NaN (0 / 0); empty or all-NA input is
# returned unchanged in length without min()/max() warnings.
normalize <- function(x, na.rm = TRUE) {
  # Nothing to scale: avoid min()/max() warnings on empty input
  if (length(x) == 0L || (na.rm && all(is.na(x)))) {
    return(as.numeric(x))
  }
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo
  # Only reachable with na.rm = FALSE and NAs present: everything is unknown
  if (is.na(span)) {
    return(rep(NA_real_, length(x)))
  }
  # Constant input: x - lo is 0 for every non-missing value (NAs preserved)
  if (span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}

# Rescale every column listed in num.vars, then collect them in a data frame
data.all.n <- lapply(data.all[num.vars], normalize)
data.all.n <- as.data.frame(data.all.n)
# Each normalized column should now span 0 to 1
summary(data.all.n)
##     UTOTHRS           TENURE           HRLYEARN     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3509   1st Qu.:0.05858   1st Qu.:0.1147  
##  Median :0.4016   Median :0.21757   Median :0.1676  
##  Mean   :0.3634   Mean   :0.35573   Mean   :0.1973  
##  3rd Qu.:0.4016   3rd Qu.:0.59414   3rd Qu.:0.2537  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000
# "not in" operator: logical negation of %in%
`%nin%` <- Negate(`%in%`)
# Columns not listed in num.vars first, followed by the normalized numeric columns
data.all.n <- cbind(
  data.all[, names(data.all) %nin% num.vars, drop = FALSE],
  data.all.n
)
  • Training and testing subsets by group
# TRAINING (70%) AND TESTING (30%) SETS
# One subset per survey year x sex combination
data.all.n09male <- data.all.n[data.all.n$SURVYEAR == 2009 & data.all.n$SEX == "Male", ]
data.all.n09fem <- data.all.n[data.all.n$SURVYEAR == 2009 & data.all.n$SEX == "Female", ]
data.all.n19male <- data.all.n[data.all.n$SURVYEAR == 2019 & data.all.n$SEX == "Male", ]
data.all.n19fem <- data.all.n[data.all.n$SURVYEAR == 2019 & data.all.n$SEX == "Female", ]

# For each subset: fix the seed, draw 70% of the row positions for training,
# save those positions to CSV, and keep the remaining rows for testing.

# 2009 Males
set.seed(1)
idx.09male <- sample(
  seq_len(nrow(data.all.n09male)),
  size = floor(0.7 * nrow(data.all.n09male))
)
write.csv(idx.09male, file = "Train_Idx_09Male.csv", row.names = FALSE)
train.09male <- data.all.n09male[idx.09male, ]
test.09male <- data.all.n09male[-idx.09male, ]

# 2009 Females
set.seed(30)
idx.09fem <- sample(
  seq_len(nrow(data.all.n09fem)),
  size = floor(0.7 * nrow(data.all.n09fem))
)
write.csv(idx.09fem, file = "Train_Idx_09Fem.csv", row.names = FALSE)
train.09fem <- data.all.n09fem[idx.09fem, ]
test.09fem <- data.all.n09fem[-idx.09fem, ]

# 2019 Males
set.seed(500)
idx.19male <- sample(
  seq_len(nrow(data.all.n19male)),
  size = floor(0.7 * nrow(data.all.n19male))
)
write.csv(idx.19male, file = "Train_Idx_19Male.csv", row.names = FALSE)
train.19male <- data.all.n19male[idx.19male, ]
test.19male <- data.all.n19male[-idx.19male, ]

# 2019 Females
set.seed(7000)
idx.19fem <- sample(
  seq_len(nrow(data.all.n19fem)),
  size = floor(0.7 * nrow(data.all.n19fem))
)
write.csv(idx.19fem, file = "Train_Idx_19Fem.csv", row.names = FALSE)
train.19fem <- data.all.n19fem[idx.19fem, ]
test.19fem <- data.all.n19fem[-idx.19fem, ]

Stepwise Regression (Both Directions)

  • First Model results and diagnostic plots
  • Box Cox transformation
  • Final Model results, diagnostic plots, residuals skewness, and prediction
  • Analysis by group (Males 2009, Males 2019, Females 2009, Females 2019)
# Males 2009
# Model
# List the columns of data.all for reference
names(data.all)
##  [1] "REC_NUM"       "SURVYEAR"      "SURVMNTH"      "LFSSTAT"      
##  [5] "PROV"          "CMA"           "AGE_12"        "SEX"          
##  [9] "MARSTAT"       "EDUC"          "MJH"           "COWMAIN"      
## [13] "IMMIG"         "NAICS_18"      "NOC_10"        "NOC_40"       
## [17] "FTPTMAIN"      "UTOTHRS"       "TENURE"        "HRLYEARN"     
## [21] "UNION"         "PERMTEMP"      "ESTSIZE"       "FIRMSIZE"     
## [25] "SCHOOLN"       "EFAMTYPE"      "AGYOWNK"       "EDUCshort"    
## [29] "NAICS_18short" "NOC_10short"
# Upper scope: hourly wage regressed on every candidate predictor (2009 male training set)
full <- lm(
  HRLYEARN ~ LFSSTAT + PROV + CMA + AGE_12 + MARSTAT + EDUC + MJH + COWMAIN +
    NAICS_18 + NOC_10 + FTPTMAIN + UTOTHRS + TENURE + UNION + PERMTEMP +
    ESTSIZE + FIRMSIZE + SCHOOLN + EFAMTYPE + AGYOWNK,
  data = train.09male
)
# Lower scope and starting point: intercept-only model
null <- lm(HRLYEARN ~ 1, data = train.09male)
# Stepwise AIC search in both directions between the two scopes
model <- stepAIC(
  null,
  scope = list(lower = null, upper = full),
  direction = "both",
  trace = FALSE
) # Trace TRUE to show steps of adding and subtracting vars
summary(model)
## 
## Call:
## lm(formula = HRLYEARN ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC + 
##     TENURE + ESTSIZE + FIRMSIZE + AGYOWNK + PERMTEMP + FTPTMAIN + 
##     UTOTHRS + UNION + MARSTAT + LFSSTAT + COWMAIN, data = train.09male)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34761 -0.04505 -0.00701  0.03390  0.64416 
## 
## Coefficients:
##                                                  Estimate Std. Error t value
## (Intercept)                                     0.2573064  0.0080378  32.012
## NOC_10Business, finance & administration       -0.0980386  0.0028446 -34.464
## NOC_10Natural & applied sciences               -0.0530643  0.0028425 -18.668
## NOC_10Health                                   -0.0357390  0.0052941  -6.751
## NOC_10Educ., law, community & gov. services    -0.0479927  0.0037356 -12.847
## NOC_10Art, culture, recreation & sport         -0.0764742  0.0054402 -14.057
## NOC_10Sales & service                          -0.1004194  0.0025816 -38.898
## NOC_10Trades, transport & equipm. operators    -0.0931347  0.0025773 -36.136
## NOC_10Natural resources & agriculture          -0.0958967  0.0043085 -22.257
## NOC_10Manufacturing & utilities                -0.1145789  0.0033682 -34.018
## NAICS_18Forestry, Fishing, Min., Oil & Gas      0.0545069  0.0052637  10.355
## NAICS_18Utilities                               0.0634565  0.0071919   8.823
## NAICS_18Construction                            0.0418910  0.0056413   7.426
## NAICS_18Manufacturing durables                  0.0180903  0.0057355   3.154
## NAICS_18Manufacturing non-durables              0.0110890  0.0058769   1.887
## NAICS_18Wholesale Trade                         0.0153245  0.0059535   2.574
## NAICS_18Retail Trade                           -0.0166003  0.0056865  -2.919
## NAICS_18Transportation & Warehousing            0.0059642  0.0058085   1.027
## NAICS_18Finance, Insurance, Real Est. & Leas.   0.0223155  0.0061734   3.615
## NAICS_18Prof., Scientific & Technical Services  0.0444630  0.0060855   7.306
## NAICS_18Management, Admin. & Support           -0.0146452  0.0059094  -2.478
## NAICS_18Educational Services                    0.0075440  0.0066166   1.140
## NAICS_18Health Care & Social Assistance        -0.0170636  0.0066286  -2.574
## NAICS_18Information, Culture & Recreation       0.0037343  0.0060453   0.618
## NAICS_18Accommodation & Food Services          -0.0288712  0.0060149  -4.800
## NAICS_18Other Services                          0.0006623  0.0061136   0.108
## NAICS_18Public Administration                   0.0468728  0.0062819   7.462
## AGE_12.L                                        0.0125046  0.0047295   2.644
## AGE_12.Q                                       -0.0529037  0.0044881 -11.788
## AGE_12.C                                       -0.0193159  0.0038943  -4.960
## AGE_12^4                                       -0.0058821  0.0035415  -1.661
## AGE_12^5                                       -0.0112318  0.0032448  -3.462
## AGE_12^6                                        0.0032615  0.0029043   1.123
## AGE_12^7                                       -0.0034502  0.0025315  -1.363
## AGE_12^8                                        0.0052577  0.0021961   2.394
## AGE_12^9                                        0.0028232  0.0019463   1.451
## AGE_12^10                                       0.0042481  0.0017737   2.395
## AGE_12^11                                       0.0004379  0.0016550   0.265
## PROVPEI                                        -0.0195493  0.0047748  -4.094
## PROVNS                                         -0.0116356  0.0040612  -2.865
## PROVNB                                         -0.0099912  0.0039832  -2.508
## PROVQC                                          0.0069783  0.0034623   2.015
## PROVON                                          0.0214436  0.0033285   6.442
## PROVMB                                          0.0004315  0.0036292   0.119
## PROVSK                                          0.0250554  0.0037888   6.613
## PROVAB                                          0.0479803  0.0035522  13.507
## PROVBC                                          0.0316994  0.0035888   8.833
## EDUC.L                                          0.0632672  0.0028592  22.127
## EDUC.Q                                          0.0166378  0.0025775   6.455
## EDUC.C                                          0.0062549  0.0020482   3.054
## EDUC^4                                         -0.0063464  0.0018846  -3.368
## EDUC^5                                         -0.0038765  0.0013888  -2.791
## EDUC^6                                         -0.0024849  0.0015972  -1.556
## TENURE                                          0.0350170  0.0019904  17.593
## ESTSIZE.L                                       0.0208994  0.0017118  12.209
## ESTSIZE.Q                                       0.0046253  0.0014286   3.238
## ESTSIZE.C                                       0.0012085  0.0012716   0.950
## FIRMSIZE.L                                      0.0145086  0.0015441   9.396
## FIRMSIZE.Q                                     -0.0002057  0.0014856  -0.138
## FIRMSIZE.C                                      0.0004759  0.0015209   0.313
## AGYOWNK.L                                      -0.0076283  0.0017404  -4.383
## AGYOWNK.Q                                      -0.0033694  0.0018326  -1.839
## AGYOWNK.C                                      -0.0011279  0.0019759  -0.571
## AGYOWNK^4                                      -0.0065809  0.0020133  -3.269
## PERMTEMPTemp. season                           -0.0172820  0.0026052  -6.634
## PERMTEMPTemp. contract                         -0.0123026  0.0025365  -4.850
## PERMTEMPTemp. casual                           -0.0136647  0.0037091  -3.684
## FTPTMAINPart-time                              -0.0262977  0.0027109  -9.701
## UTOTHRS                                        -0.0529196  0.0072706  -7.279
## UNIONNot member but covered                     0.0067830  0.0039039   1.738
## UNIONNon-unionized                             -0.0066323  0.0015134  -4.382
## MARSTATCommon-law                              -0.0015794  0.0018519  -0.853
## MARSTATWidowed                                 -0.0005856  0.0081516  -0.072
## MARSTATSeparated                                0.0026465  0.0037717   0.702
## MARSTATDivorced                                -0.0027376  0.0031783  -0.861
## MARSTATSingle, NM                              -0.0096063  0.0019417  -4.947
## LFSSTATEmployed, absent from work              -0.0062774  0.0024873  -2.524
## COWMAINPrivate sector                           0.0055096  0.0027263   2.021
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## NOC_10Business, finance & administration        < 2e-16 ***
## NOC_10Natural & applied sciences                < 2e-16 ***
## NOC_10Health                                   1.51e-11 ***
## NOC_10Educ., law, community & gov. services     < 2e-16 ***
## NOC_10Art, culture, recreation & sport          < 2e-16 ***
## NOC_10Sales & service                           < 2e-16 ***
## NOC_10Trades, transport & equipm. operators     < 2e-16 ***
## NOC_10Natural resources & agriculture           < 2e-16 ***
## NOC_10Manufacturing & utilities                 < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas      < 2e-16 ***
## NAICS_18Utilities                               < 2e-16 ***
## NAICS_18Construction                           1.17e-13 ***
## NAICS_18Manufacturing durables                 0.001612 ** 
## NAICS_18Manufacturing non-durables             0.059191 .  
## NAICS_18Wholesale Trade                        0.010060 *  
## NAICS_18Retail Trade                           0.003513 ** 
## NAICS_18Transportation & Warehousing           0.304528    
## NAICS_18Finance, Insurance, Real Est. & Leas.  0.000301 ***
## NAICS_18Prof., Scientific & Technical Services 2.85e-13 ***
## NAICS_18Management, Admin. & Support           0.013210 *  
## NAICS_18Educational Services                   0.254235    
## NAICS_18Health Care & Social Assistance        0.010054 *  
## NAICS_18Information, Culture & Recreation      0.536771    
## NAICS_18Accommodation & Food Services          1.60e-06 ***
## NAICS_18Other Services                         0.913735    
## NAICS_18Public Administration                  8.92e-14 ***
## AGE_12.L                                       0.008200 ** 
## AGE_12.Q                                        < 2e-16 ***
## AGE_12.C                                       7.11e-07 ***
## AGE_12^4                                       0.096747 .  
## AGE_12^5                                       0.000538 ***
## AGE_12^6                                       0.261464    
## AGE_12^7                                       0.172931    
## AGE_12^8                                       0.016670 *  
## AGE_12^9                                       0.146917    
## AGE_12^10                                      0.016627 *  
## AGE_12^11                                      0.791338    
## PROVPEI                                        4.25e-05 ***
## PROVNS                                         0.004174 ** 
## PROVNB                                         0.012139 *  
## PROVQC                                         0.043867 *  
## PROVON                                         1.20e-10 ***
## PROVMB                                         0.905348    
## PROVSK                                         3.87e-11 ***
## PROVAB                                          < 2e-16 ***
## PROVBC                                          < 2e-16 ***
## EDUC.L                                          < 2e-16 ***
## EDUC.Q                                         1.11e-10 ***
## EDUC.C                                         0.002263 ** 
## EDUC^4                                         0.000760 ***
## EDUC^5                                         0.005256 ** 
## EDUC^6                                         0.119776    
## TENURE                                          < 2e-16 ***
## ESTSIZE.L                                       < 2e-16 ***
## ESTSIZE.Q                                      0.001208 ** 
## ESTSIZE.C                                      0.341907    
## FIRMSIZE.L                                      < 2e-16 ***
## FIRMSIZE.Q                                     0.889889    
## FIRMSIZE.C                                     0.754376    
## AGYOWNK.L                                      1.18e-05 ***
## AGYOWNK.Q                                      0.065994 .  
## AGYOWNK.C                                      0.568126    
## AGYOWNK^4                                      0.001082 ** 
## PERMTEMPTemp. season                           3.37e-11 ***
## PERMTEMPTemp. contract                         1.24e-06 ***
## PERMTEMPTemp. casual                           0.000230 ***
## FTPTMAINPart-time                               < 2e-16 ***
## UTOTHRS                                        3.51e-13 ***
## UNIONNot member but covered                    0.082314 .  
## UNIONNon-unionized                             1.18e-05 ***
## MARSTATCommon-law                              0.393758    
## MARSTATWidowed                                 0.942728    
## MARSTATSeparated                               0.482893    
## MARSTATDivorced                                0.389067    
## MARSTATSingle, NM                              7.58e-07 ***
## LFSSTATEmployed, absent from work              0.011620 *  
## COWMAINPrivate sector                          0.043304 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0771 on 18781 degrees of freedom
## Multiple R-squared:  0.4733, Adjusted R-squared:  0.4711 
## F-statistic: 219.1 on 77 and 18781 DF,  p-value: < 2.2e-16
# Investigating SCHOOLN Coefficient NA
# Inspect the factor structure of SCHOOLN in the full data set.
str(data.all$SCHOOLN)
##  Factor w/ 4 levels "Non-student",..: 1 1 1 1 1 1 1 1 1 1 ...
# Count the training rows whose SCHOOLN is "Unknown". Summing the logical mask
# avoids copying the data frame, and na.rm = TRUE keeps rows with a missing
# SCHOOLN out of the count (logical row subsetting would keep them as NA rows).
sum(train.09male$SCHOOLN == "Unknown", na.rm = TRUE)
## [1] 407
# Diagnostic Plots
# par(mfrow = c(2, 2))
# Set the figure margins (bottom, left, top, right), then draw the lm
# diagnostic plots for the stepwise-selected model.
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = model, lwd = 6)

# Box Cox Transformation (Selected attributes from Stepwise Regression)
# Refit the stepwise-selected formula so boxcox() can profile the
# log-likelihood over candidate power transformations of HRLYEARN.
bc.model <- lm(
  formula = HRLYEARN ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
    TENURE + ESTSIZE + FIRMSIZE + AGYOWNK + PERMTEMP + FTPTMAIN +
    UTOTHRS + UNION + MARSTAT + LFSSTAT + COWMAIN,
  data = train.09male
)
# Profile lambda over -3..3; bc$x holds the lambdas, bc$y the log-likelihoods.
bc <- boxcox(bc.model, lambda = seq(-3, 3))

# Lambda with the highest log-likelihood. which.max() replaces the exact
# floating-point test bc$y == max(bc$y) and always yields a single index.
bc$x[which.max(bc$y)]
## [1] 0.2121212
# Refit with the response raised to the power 0.21 (the Box-Cox lambda
# printed above, 0.2121212, rounded to two decimals).
# NOTE(review): this formula ends in MJH, while the bc.model formula fitted
# for the Box-Cox step ends in COWMAIN. Confirm which predictor is intended.
new.model <- lm(
  formula = (HRLYEARN)^0.21 ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
    TENURE + ESTSIZE + AGYOWNK + FIRMSIZE + PERMTEMP + FTPTMAIN +
    UTOTHRS + MARSTAT + UNION + LFSSTAT + MJH,
  data = train.09male
)
summary(new.model)
## 
## Call:
## lm(formula = (HRLYEARN)^0.21 ~ NOC_10 + NAICS_18 + AGE_12 + PROV + 
##     EDUC + TENURE + ESTSIZE + AGYOWNK + FIRMSIZE + PERMTEMP + 
##     FTPTMAIN + UTOTHRS + MARSTAT + UNION + LFSSTAT + MJH, data = train.09male)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.39703 -0.03265 -0.00008  0.03233  0.24573 
## 
## Coefficients:
##                                                  Estimate Std. Error t value
## (Intercept)                                     0.7177075  0.0055964 128.244
## NOC_10Business, finance & administration       -0.0622680  0.0020346 -30.604
## NOC_10Natural & applied sciences               -0.0302886  0.0020329 -14.899
## NOC_10Health                                   -0.0205009  0.0037842  -5.417
## NOC_10Educ., law, community & gov. services    -0.0301744  0.0026716 -11.294
## NOC_10Art, culture, recreation & sport         -0.0436001  0.0038923 -11.202
## NOC_10Sales & service                          -0.0690683  0.0018462 -37.411
## NOC_10Trades, transport & equipm. operators    -0.0571677  0.0018431 -31.018
## NOC_10Natural resources & agriculture          -0.0583828  0.0030814 -18.947
## NOC_10Manufacturing & utilities                -0.0756620  0.0024086 -31.414
## NAICS_18Forestry, Fishing, Min., Oil & Gas      0.0491871  0.0037648  13.065
## NAICS_18Utilities                               0.0531426  0.0049322  10.775
## NAICS_18Construction                            0.0464192  0.0040344  11.506
## NAICS_18Manufacturing durables                  0.0289088  0.0041007   7.050
## NAICS_18Manufacturing non-durables              0.0211285  0.0042017   5.029
## NAICS_18Wholesale Trade                         0.0257190  0.0042575   6.041
## NAICS_18Retail Trade                           -0.0022691  0.0040668  -0.558
## NAICS_18Transportation & Warehousing            0.0149749  0.0041431   3.614
## NAICS_18Finance, Insurance, Real Est. & Leas.   0.0267711  0.0044145   6.064
## NAICS_18Prof., Scientific & Technical Services  0.0455705  0.0043521  10.471
## NAICS_18Management, Admin. & Support           -0.0026267  0.0042262  -0.622
## NAICS_18Educational Services                    0.0177953  0.0044631   3.987
## NAICS_18Health Care & Social Assistance         0.0014326  0.0046158   0.310
## NAICS_18Information, Culture & Recreation       0.0137391  0.0043174   3.182
## NAICS_18Accommodation & Food Services          -0.0137698  0.0043017  -3.201
## NAICS_18Other Services                          0.0136806  0.0043723   3.129
## NAICS_18Public Administration                   0.0438451  0.0041542  10.554
## AGE_12.L                                        0.0128548  0.0033823   3.801
## AGE_12.Q                                       -0.0527933  0.0032098 -16.447
## AGE_12.C                                       -0.0049210  0.0027853  -1.767
## AGE_12^4                                       -0.0109511  0.0025330  -4.323
## AGE_12^5                                       -0.0080144  0.0023207  -3.453
## AGE_12^6                                        0.0030622  0.0020772   1.474
## AGE_12^7                                       -0.0042250  0.0018107  -2.333
## AGE_12^8                                        0.0039378  0.0015706   2.507
## AGE_12^9                                        0.0011558  0.0013919   0.830
## AGE_12^10                                       0.0022889  0.0012685   1.804
## AGE_12^11                                      -0.0003042  0.0011836  -0.257
## PROVPEI                                        -0.0152036  0.0034159  -4.451
## PROVNS                                         -0.0074142  0.0029051  -2.552
## PROVNB                                         -0.0069053  0.0028496  -2.423
## PROVQC                                          0.0077757  0.0024785   3.137
## PROVON                                          0.0177324  0.0023829   7.442
## PROVMB                                          0.0038445  0.0025985   1.479
## PROVSK                                          0.0227459  0.0027121   8.387
## PROVAB                                          0.0392619  0.0025429  15.440
## PROVBC                                          0.0271287  0.0025688  10.561
## EDUC.L                                          0.0415791  0.0020446  20.336
## EDUC.Q                                          0.0043983  0.0018435   2.386
## EDUC.C                                          0.0024609  0.0014646   1.680
## EDUC^4                                         -0.0054203  0.0013480  -4.021
## EDUC^5                                         -0.0026859  0.0009933  -2.704
## EDUC^6                                         -0.0005705  0.0011422  -0.499
## TENURE                                          0.0284930  0.0014218  20.041
## ESTSIZE.L                                       0.0150023  0.0012232  12.265
## ESTSIZE.Q                                       0.0025721  0.0010208   2.520
## ESTSIZE.C                                       0.0009307  0.0009092   1.024
## AGYOWNK.L                                      -0.0054457  0.0012447  -4.375
## AGYOWNK.Q                                      -0.0020668  0.0013108  -1.577
## AGYOWNK.C                                      -0.0007248  0.0014131  -0.513
## AGYOWNK^4                                      -0.0035055  0.0014398  -2.435
## FIRMSIZE.L                                      0.0099756  0.0011033   9.042
## FIRMSIZE.Q                                     -0.0006254  0.0010624  -0.589
## FIRMSIZE.C                                      0.0003912  0.0010877   0.360
## PERMTEMPTemp. season                           -0.0160308  0.0018627  -8.606
## PERMTEMPTemp. contract                         -0.0097256  0.0018138  -5.362
## PERMTEMPTemp. casual                           -0.0131244  0.0026522  -4.949
## FTPTMAINPart-time                              -0.0257426  0.0020261 -12.706
## UTOTHRS                                        -0.0320650  0.0056232  -5.702
## MARSTATCommon-law                              -0.0009476  0.0013244  -0.716
## MARSTATWidowed                                 -0.0006996  0.0058299  -0.120
## MARSTATSeparated                               -0.0003618  0.0026973  -0.134
## MARSTATDivorced                                -0.0020224  0.0022730  -0.890
## MARSTATSingle, NM                              -0.0081825  0.0013886  -5.892
## UNIONNot member but covered                     0.0003877  0.0027905   0.139
## UNIONNon-unionized                             -0.0107784  0.0010635 -10.135
## LFSSTATEmployed, absent from work              -0.0034503  0.0017794  -1.939
## MJHMultiple jobholder                          -0.0029147  0.0022009  -1.324
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## NOC_10Business, finance & administration        < 2e-16 ***
## NOC_10Natural & applied sciences                < 2e-16 ***
## NOC_10Health                                   6.12e-08 ***
## NOC_10Educ., law, community & gov. services     < 2e-16 ***
## NOC_10Art, culture, recreation & sport          < 2e-16 ***
## NOC_10Sales & service                           < 2e-16 ***
## NOC_10Trades, transport & equipm. operators     < 2e-16 ***
## NOC_10Natural resources & agriculture           < 2e-16 ***
## NOC_10Manufacturing & utilities                 < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas      < 2e-16 ***
## NAICS_18Utilities                               < 2e-16 ***
## NAICS_18Construction                            < 2e-16 ***
## NAICS_18Manufacturing durables                 1.85e-12 ***
## NAICS_18Manufacturing non-durables             4.99e-07 ***
## NAICS_18Wholesale Trade                        1.56e-09 ***
## NAICS_18Retail Trade                           0.576869    
## NAICS_18Transportation & Warehousing           0.000302 ***
## NAICS_18Finance, Insurance, Real Est. & Leas.  1.35e-09 ***
## NAICS_18Prof., Scientific & Technical Services  < 2e-16 ***
## NAICS_18Management, Admin. & Support           0.534263    
## NAICS_18Educational Services                   6.71e-05 ***
## NAICS_18Health Care & Social Assistance        0.756280    
## NAICS_18Information, Culture & Recreation      0.001464 ** 
## NAICS_18Accommodation & Food Services          0.001372 ** 
## NAICS_18Other Services                         0.001757 ** 
## NAICS_18Public Administration                   < 2e-16 ***
## AGE_12.L                                       0.000145 ***
## AGE_12.Q                                        < 2e-16 ***
## AGE_12.C                                       0.077281 .  
## AGE_12^4                                       1.54e-05 ***
## AGE_12^5                                       0.000555 ***
## AGE_12^6                                       0.140438    
## AGE_12^7                                       0.019641 *  
## AGE_12^8                                       0.012176 *  
## AGE_12^9                                       0.406325    
## AGE_12^10                                      0.071173 .  
## AGE_12^11                                      0.797203    
## PROVPEI                                        8.60e-06 ***
## PROVNS                                         0.010716 *  
## PROVNB                                         0.015390 *  
## PROVQC                                         0.001708 ** 
## PROVON                                         1.04e-13 ***
## PROVMB                                         0.139026    
## PROVSK                                          < 2e-16 ***
## PROVAB                                          < 2e-16 ***
## PROVBC                                          < 2e-16 ***
## EDUC.L                                          < 2e-16 ***
## EDUC.Q                                         0.017048 *  
## EDUC.C                                         0.092921 .  
## EDUC^4                                         5.82e-05 ***
## EDUC^5                                         0.006856 ** 
## EDUC^6                                         0.617447    
## TENURE                                          < 2e-16 ***
## ESTSIZE.L                                       < 2e-16 ***
## ESTSIZE.Q                                      0.011754 *  
## ESTSIZE.C                                      0.306046    
## AGYOWNK.L                                      1.22e-05 ***
## AGYOWNK.Q                                      0.114870    
## AGYOWNK.C                                      0.608015    
## AGYOWNK^4                                      0.014913 *  
## FIRMSIZE.L                                      < 2e-16 ***
## FIRMSIZE.Q                                     0.556072    
## FIRMSIZE.C                                     0.719102    
## PERMTEMPTemp. season                            < 2e-16 ***
## PERMTEMPTemp. contract                         8.32e-08 ***
## PERMTEMPTemp. casual                           7.54e-07 ***
## FTPTMAINPart-time                               < 2e-16 ***
## UTOTHRS                                        1.20e-08 ***
## MARSTATCommon-law                              0.474309    
## MARSTATWidowed                                 0.904488    
## MARSTATSeparated                               0.893291    
## MARSTATDivorced                                0.373613    
## MARSTATSingle, NM                              3.87e-09 ***
## UNIONNot member but covered                    0.889507    
## UNIONNon-unionized                              < 2e-16 ***
## LFSSTATEmployed, absent from work              0.052513 .  
## MJHMultiple jobholder                          0.185417    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05514 on 18781 degrees of freedom
## Multiple R-squared:  0.529,  Adjusted R-squared:  0.5271 
## F-statistic: 273.9 on 77 and 18781 DF,  p-value: < 2.2e-16
# Set the figure margins (bottom, left, top, right), then draw the lm
# diagnostic plots for the transformed-response model.
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = new.model, lwd = 6)

# Residuals Skewness
# Before Transformation:
skewness(residuals(model))
## [1] 1.105503
# After Transformation
skewness(residuals(new.model))
## [1] -0.1655788
# Prediction
# Predict the transformed response for the held-out rows. The "fit" column
# holds the point predictions; the interval bounds are returned alongside it.
prediction <- predict(new.model, interval = "prediction", newdata = test.09male)
# Errors
# NOTE: the errors and every metric below are on the transformed scale
# (HRLYEARN^0.21), not in the original units of HRLYEARN.
errors <- prediction[, "fit"] - (test.09male$HRLYEARN^0.21)
hist(errors)

rmse <- sqrt(sum((errors)^2) / nrow(test.09male))
mae <- (1 / nrow(test.09male)) * sum(abs(errors))
# Absolute error as a percentage of the actual transformed value.
diff.percent <- 100 * (abs(errors) / (test.09male$HRLYEARN^0.21))
# Share of test rows whose absolute percentage error is at most 25.
# Counting TRUEs with na.rm = TRUE keeps NA/NaN percentages (e.g. 0/0) from
# being counted as within the threshold, which length(x[x <= 25]) would do.
diff.25 <- sum(diff.percent <= 25, na.rm = TRUE) / nrow(test.09male)
paste("RMSE:", rmse)
## [1] "RMSE: 0.0555053839803959"
paste("MAE:", mae)
## [1] "MAE: 0.0422321988304378"
paste("Percentage of cases with less than 25% error:", diff.25*100)
## [1] "Percentage of cases with less than 25% error: 99.0473833972535"
# Males 2019
# Model
# Upper scope of the search: HRLYEARN regressed on every candidate predictor.
full <- lm(
  HRLYEARN ~ LFSSTAT + PROV + CMA + AGE_12 + MARSTAT + EDUC + MJH + COWMAIN +
    NAICS_18 + NOC_10 + FTPTMAIN + UTOTHRS + TENURE + UNION + PERMTEMP +
    ESTSIZE + FIRMSIZE + SCHOOLN + EFAMTYPE + AGYOWNK,
  data = train.19male
)
# Lower scope and starting point: the intercept-only model.
null <- lm(HRLYEARN ~ 1, data = train.19male)
# Stepwise AIC selection in both directions, starting from `null`.
# Set trace = TRUE to show the steps of adding and subtracting variables.
# FALSE is spelled out because the alias F can be reassigned.
model <- stepAIC(
  null,
  scope = list(lower = null, upper = full),
  direction = "both",
  trace = FALSE
)
summary(model)
## 
## Call:
## lm(formula = HRLYEARN ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC + 
##     TENURE + FIRMSIZE + MARSTAT + ESTSIZE + FTPTMAIN + UTOTHRS + 
##     PERMTEMP + EFAMTYPE + SCHOOLN + CMA + UNION + AGYOWNK + MJH + 
##     COWMAIN, data = train.19male)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40507 -0.05420 -0.00902  0.04277  0.61524 
## 
## Coefficients: (1 not defined because of singularities)
##                                                  Estimate Std. Error t value
## (Intercept)                                     3.407e-01  1.131e-02  30.131
## NOC_10Business, finance & administration       -1.203e-01  3.615e-03 -33.271
## NOC_10Natural & applied sciences               -8.063e-02  3.482e-03 -23.158
## NOC_10Health                                   -7.901e-02  6.164e-03 -12.816
## NOC_10Educ., law, community & gov. services    -7.289e-02  4.118e-03 -17.702
## NOC_10Art, culture, recreation & sport         -1.332e-01  6.782e-03 -19.639
## NOC_10Sales & service                          -1.559e-01  3.381e-03 -46.110
## NOC_10Trades, transport & equipm. operators    -1.277e-01  3.247e-03 -39.315
## NOC_10Natural resources & agriculture          -1.441e-01  5.212e-03 -27.650
## NOC_10Manufacturing & utilities                -1.451e-01  4.223e-03 -34.358
## NAICS_18Forestry, Fishing, Min., Oil & Gas      9.717e-02  6.317e-03  15.383
## NAICS_18Utilities                               1.156e-01  8.987e-03  12.863
## NAICS_18Construction                            5.202e-02  6.828e-03   7.619
## NAICS_18Manufacturing durables                  1.660e-02  7.026e-03   2.363
## NAICS_18Manufacturing non-durables              1.504e-02  7.227e-03   2.081
## NAICS_18Wholesale Trade                         3.235e-02  7.299e-03   4.432
## NAICS_18Retail Trade                           -1.047e-02  6.901e-03  -1.516
## NAICS_18Transportation & Warehousing            1.163e-02  7.012e-03   1.658
## NAICS_18Finance, Insurance, Real Est. & Leas.   3.876e-02  7.369e-03   5.261
## NAICS_18Prof., Scientific & Technical Services  4.925e-02  7.220e-03   6.821
## NAICS_18Management, Admin. & Support           -1.136e-04  7.144e-03  -0.016
## NAICS_18Educational Services                    2.013e-02  7.980e-03   2.523
## NAICS_18Health Care & Social Assistance        -1.633e-02  7.860e-03  -2.078
## NAICS_18Information, Culture & Recreation       7.651e-03  7.320e-03   1.045
## NAICS_18Accommodation & Food Services          -1.523e-02  7.354e-03  -2.071
## NAICS_18Other Services                          1.465e-02  7.517e-03   1.949
## NAICS_18Public Administration                   4.074e-02  7.652e-03   5.325
## AGE_12.L                                       -1.319e-03  5.273e-03  -0.250
## AGE_12.Q                                       -3.673e-02  4.741e-03  -7.748
## AGE_12.C                                       -1.642e-02  3.870e-03  -4.242
## AGE_12^4                                        6.800e-03  3.452e-03   1.970
## AGE_12^5                                       -1.105e-02  3.169e-03  -3.485
## AGE_12^6                                       -3.398e-04  2.906e-03  -0.117
## AGE_12^7                                       -6.130e-03  2.628e-03  -2.333
## AGE_12^8                                       -4.770e-03  2.373e-03  -2.010
## AGE_12^9                                       -9.932e-05  2.193e-03  -0.045
## AGE_12^10                                       7.281e-04  2.107e-03   0.346
## AGE_12^11                                       2.274e-03  2.091e-03   1.087
## PROVPEI                                        -2.745e-02  5.707e-03  -4.811
## PROVNS                                         -2.001e-02  4.921e-03  -4.065
## PROVNB                                         -2.706e-02  4.925e-03  -5.494
## PROVQC                                         -1.403e-03  4.290e-03  -0.327
## PROVON                                          1.369e-02  4.109e-03   3.331
## PROVMB                                         -7.232e-04  4.542e-03  -0.159
## PROVSK                                          1.979e-02  4.611e-03   4.292
## PROVAB                                          5.475e-02  4.358e-03  12.564
## PROVBC                                          2.868e-02  4.494e-03   6.382
## EDUC.L                                          6.938e-02  3.600e-03  19.273
## EDUC.Q                                          2.242e-02  3.325e-03   6.743
## EDUC.C                                          6.487e-03  2.621e-03   2.476
## EDUC^4                                         -3.277e-03  2.519e-03  -1.301
## EDUC^5                                          2.334e-04  1.750e-03   0.133
## EDUC^6                                          1.265e-03  2.251e-03   0.562
## TENURE                                          4.989e-02  2.429e-03  20.541
## FIRMSIZE.L                                      2.089e-02  1.935e-03  10.795
## FIRMSIZE.Q                                     -1.287e-03  1.793e-03  -0.718
## FIRMSIZE.C                                      1.915e-03  1.815e-03   1.055
## MARSTATCommon-law                              -4.560e-03  2.120e-03  -2.151
## MARSTATWidowed                                 -1.986e-02  1.006e-02  -1.975
## MARSTATSeparated                                2.847e-03  5.607e-03   0.508
## MARSTATDivorced                                 7.375e-03  4.861e-03   1.517
## MARSTATSingle, NM                              -1.254e-02  3.485e-03  -3.600
## ESTSIZE.L                                       2.187e-02  2.073e-03  10.549
## ESTSIZE.Q                                       8.261e-03  1.716e-03   4.813
## ESTSIZE.C                                      -1.445e-03  1.525e-03  -0.948
## FTPTMAINPart-time                              -2.867e-02  3.536e-03  -8.108
## UTOTHRS                                        -4.967e-02  9.622e-03  -5.163
## PERMTEMPTemp. season                           -1.753e-02  3.422e-03  -5.123
## PERMTEMPTemp. contract                         -5.953e-03  3.076e-03  -1.936
## PERMTEMPTemp. casual                           -1.216e-02  4.287e-03  -2.837
## EFAMTYPEHWDENC                                  4.484e-03  3.668e-03   1.223
## EFAMTYPEHWDE17                                 -1.599e-03  4.888e-03  -0.327
## EFAMTYPEHWDE24                                 -7.134e-03  4.590e-03  -1.554
## EFAMTYPEHWSHNC                                  1.056e-02  4.210e-03   2.509
## EFAMTYPEHWSH17                                  1.851e-03  5.532e-03   0.335
## EFAMTYPEHWSH24                                  5.838e-03  7.291e-03   0.801
## EFAMTYPEHWSWNC                                 -1.848e-03  1.011e-02  -0.183
## EFAMTYPEHWSW17                                 -2.988e-03  1.605e-02  -0.186
## EFAMTYPEHWSW24                                  3.075e-03  1.260e-02   0.244
## EFAMTYPEHWNENC                                 -1.652e-02  7.659e-03  -2.157
## EFAMTYPEHWNE17                                 -1.569e-02  2.228e-02  -0.704
## EFAMTYPEHWNE24                                  6.517e-03  1.872e-02   0.348
## EFAMTYPESPE17                                   5.415e-03  5.872e-03   0.922
## EFAMTYPESPE24                                  -5.940e-03  6.393e-03  -0.929
## EFAMTYPESPN17                                   2.430e-03  1.352e-02   0.180
## EFAMTYPESPN24                                  -9.253e-03  1.563e-02  -0.592
## EFAMTYPEOther                                  -1.002e-02  3.194e-03  -3.139
## SCHOOLNFull-time student                       -4.928e-03  4.396e-03  -1.121
## SCHOOLNPart-time student                       -2.251e-02  5.771e-03  -3.901
## SCHOOLNUnknown                                         NA         NA      NA
## CMAToronto                                     -8.723e-03  5.010e-03  -1.741
## CMAVancouver                                   -1.860e-02  5.705e-03  -3.261
## CMAOther                                       -4.095e-03  3.881e-03  -1.055
## UNIONNot member but covered                     6.762e-03  5.095e-03   1.327
## UNIONNon-unionized                             -5.923e-03  1.897e-03  -3.121
## AGYOWNK.L                                      -8.096e-03  4.649e-03  -1.741
## AGYOWNK.Q                                      -9.426e-03  3.163e-03  -2.980
## AGYOWNK.C                                      -3.479e-03  3.547e-03  -0.981
## AGYOWNK^4                                      -4.587e-04  3.344e-03  -0.137
## MJHMultiple jobholder                          -5.659e-03  3.572e-03  -1.584
## COWMAINPrivate sector                           4.808e-03  3.337e-03   1.441
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## NOC_10Business, finance & administration        < 2e-16 ***
## NOC_10Natural & applied sciences                < 2e-16 ***
## NOC_10Health                                    < 2e-16 ***
## NOC_10Educ., law, community & gov. services     < 2e-16 ***
## NOC_10Art, culture, recreation & sport          < 2e-16 ***
## NOC_10Sales & service                           < 2e-16 ***
## NOC_10Trades, transport & equipm. operators     < 2e-16 ***
## NOC_10Natural resources & agriculture           < 2e-16 ***
## NOC_10Manufacturing & utilities                 < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas      < 2e-16 ***
## NAICS_18Utilities                               < 2e-16 ***
## NAICS_18Construction                           2.69e-14 ***
## NAICS_18Manufacturing durables                 0.018155 *  
## NAICS_18Manufacturing non-durables             0.037463 *  
## NAICS_18Wholesale Trade                        9.40e-06 ***
## NAICS_18Retail Trade                           0.129426    
## NAICS_18Transportation & Warehousing           0.097236 .  
## NAICS_18Finance, Insurance, Real Est. & Leas.  1.45e-07 ***
## NAICS_18Prof., Scientific & Technical Services 9.31e-12 ***
## NAICS_18Management, Admin. & Support           0.987312    
## NAICS_18Educational Services                   0.011651 *  
## NAICS_18Health Care & Social Assistance        0.037720 *  
## NAICS_18Information, Culture & Recreation      0.295910    
## NAICS_18Accommodation & Food Services          0.038363 *  
## NAICS_18Other Services                         0.051281 .  
## NAICS_18Public Administration                  1.02e-07 ***
## AGE_12.L                                       0.802421    
## AGE_12.Q                                       9.87e-15 ***
## AGE_12.C                                       2.23e-05 ***
## AGE_12^4                                       0.048863 *  
## AGE_12^5                                       0.000493 ***
## AGE_12^6                                       0.906898    
## AGE_12^7                                       0.019683 *  
## AGE_12^8                                       0.044455 *  
## AGE_12^9                                       0.963875    
## AGE_12^10                                      0.729696    
## AGE_12^11                                      0.276856    
## PROVPEI                                        1.52e-06 ***
## PROVNS                                         4.83e-05 ***
## PROVNB                                         3.99e-08 ***
## PROVQC                                         0.743617    
## PROVON                                         0.000866 ***
## PROVMB                                         0.873489    
## PROVSK                                         1.78e-05 ***
## PROVAB                                          < 2e-16 ***
## PROVBC                                         1.79e-10 ***
## EDUC.L                                          < 2e-16 ***
## EDUC.Q                                         1.60e-11 ***
## EDUC.C                                         0.013310 *  
## EDUC^4                                         0.193324    
## EDUC^5                                         0.893877    
## EDUC^6                                         0.574242    
## TENURE                                          < 2e-16 ***
## FIRMSIZE.L                                      < 2e-16 ***
## FIRMSIZE.Q                                     0.473011    
## FIRMSIZE.C                                     0.291378    
## MARSTATCommon-law                              0.031491 *  
## MARSTATWidowed                                 0.048297 *  
## MARSTATSeparated                               0.611674    
## MARSTATDivorced                                0.129261    
## MARSTATSingle, NM                              0.000319 ***
## ESTSIZE.L                                       < 2e-16 ***
## ESTSIZE.Q                                      1.50e-06 ***
## ESTSIZE.C                                      0.343338    
## FTPTMAINPart-time                              5.49e-16 ***
## UTOTHRS                                        2.46e-07 ***
## PERMTEMPTemp. season                           3.04e-07 ***
## PERMTEMPTemp. contract                         0.052932 .  
## PERMTEMPTemp. casual                           0.004557 ** 
## EFAMTYPEHWDENC                                 0.221449    
## EFAMTYPEHWDE17                                 0.743525    
## EFAMTYPEHWDE24                                 0.120146    
## EFAMTYPEHWSHNC                                 0.012119 *  
## EFAMTYPEHWSH17                                 0.737991    
## EFAMTYPEHWSH24                                 0.423270    
## EFAMTYPEHWSWNC                                 0.854954    
## EFAMTYPEHWSW17                                 0.852334    
## EFAMTYPEHWSW24                                 0.807214    
## EFAMTYPEHWNENC                                 0.030993 *  
## EFAMTYPEHWNE17                                 0.481223    
## EFAMTYPEHWNE24                                 0.727802    
## EFAMTYPESPE17                                  0.356429    
## EFAMTYPESPE24                                  0.352819    
## EFAMTYPESPN17                                  0.857419    
## EFAMTYPESPN24                                  0.553825    
## EFAMTYPEOther                                  0.001698 ** 
## SCHOOLNFull-time student                       0.262291    
## SCHOOLNPart-time student                       9.61e-05 ***
## SCHOOLNUnknown                                       NA    
## CMAToronto                                     0.081694 .  
## CMAVancouver                                   0.001113 ** 
## CMAOther                                       0.291270    
## UNIONNot member but covered                    0.184417    
## UNIONNon-unionized                             0.001803 ** 
## AGYOWNK.L                                      0.081644 .  
## AGYOWNK.Q                                      0.002886 ** 
## AGYOWNK.C                                      0.326718    
## AGYOWNK^4                                      0.890889    
## MJHMultiple jobholder                          0.113172    
## COWMAINPrivate sector                          0.149588    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09074 on 17749 degrees of freedom
## Multiple R-squared:  0.4989, Adjusted R-squared:  0.4961 
## F-statistic: 178.5 on 99 and 17749 DF,  p-value: < 2.2e-16
# Diagnostic plots for the fitted `model`
# (optional 2 x 2 panel layout, currently disabled)
# par(mfrow = c(2, 2))
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(model, lwd = 6)

# Box-Cox transformation, using the attributes selected by stepwise regression
bc.model <- lm(
  HRLYEARN ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
    TENURE + FIRMSIZE + MARSTAT + ESTSIZE + FTPTMAIN + UTOTHRS +
    PERMTEMP + EFAMTYPE + SCHOOLN + CMA + UNION + AGYOWNK + MJH +
    COWMAIN,
  data = train.19male
)
# Temporarily use wide margins (15, 5, 2, 1) for the Box-Cox plot
par(mar = c(15, 5, 2, 1))
# Run boxcox() over the integer lambdas -3, -2, ..., 3; its x / y components
# are inspected right after this call
bc <- boxcox(bc.model, lambda = -3:3)

# Lambda (bc$x) at which the Box-Cox curve (bc$y) is largest. which.max() is
# used instead of which(y == max(y)) so the lookup does not depend on exact
# floating-point equality and always yields a single value.
bc$x[which.max(bc$y)]
## [1] -0.03030303
# Refit the same predictors with the response raised to the power -0.03
# (the Box-Cox lambda printed above, rounded to two decimals)
new.model <- lm(
  (HRLYEARN)^-0.03 ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
    TENURE + FIRMSIZE + MARSTAT + ESTSIZE + FTPTMAIN + UTOTHRS +
    PERMTEMP + EFAMTYPE + SCHOOLN + CMA + UNION + AGYOWNK + MJH +
    COWMAIN,
  data = train.19male
)
summary(new.model)
## 
## Call:
## lm(formula = (HRLYEARN)^-0.03 ~ NOC_10 + NAICS_18 + AGE_12 + 
##     PROV + EDUC + TENURE + FIRMSIZE + MARSTAT + ESTSIZE + FTPTMAIN + 
##     UTOTHRS + PERMTEMP + EFAMTYPE + SCHOOLN + CMA + UNION + AGYOWNK + 
##     MJH + COWMAIN, data = train.19male)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.052657 -0.006728 -0.000007  0.006521  0.108772 
## 
## Coefficients: (1 not defined because of singularities)
##                                                  Estimate Std. Error t value
## (Intercept)                                     1.039e+00  1.344e-03 772.595
## NOC_10Business, finance & administration        1.167e-02  4.298e-04  27.144
## NOC_10Natural & applied sciences                6.925e-03  4.140e-04  16.728
## NOC_10Health                                    6.627e-03  7.330e-04   9.041
## NOC_10Educ., law, community & gov. services     6.691e-03  4.896e-04  13.667
## NOC_10Art, culture, recreation & sport          1.262e-02  8.064e-04  15.647
## NOC_10Sales & service                           1.684e-02  4.020e-04  41.902
## NOC_10Trades, transport & equipm. operators     1.218e-02  3.861e-04  31.544
## NOC_10Natural resources & agriculture           1.423e-02  6.197e-04  22.957
## NOC_10Manufacturing & utilities                 1.498e-02  5.021e-04  29.834
## NAICS_18Forestry, Fishing, Min., Oil & Gas     -1.241e-02  7.511e-04 -16.526
## NAICS_18Utilities                              -1.314e-02  1.069e-03 -12.298
## NAICS_18Construction                           -8.587e-03  8.119e-04 -10.576
## NAICS_18Manufacturing durables                 -4.158e-03  8.355e-04  -4.977
## NAICS_18Manufacturing non-durables             -3.289e-03  8.594e-04  -3.827
## NAICS_18Wholesale Trade                        -6.059e-03  8.678e-04  -6.982
## NAICS_18Retail Trade                            1.044e-03  8.206e-04   1.272
## NAICS_18Transportation & Warehousing           -2.591e-03  8.338e-04  -3.108
## NAICS_18Finance, Insurance, Real Est. & Leas.  -6.011e-03  8.762e-04  -6.861
## NAICS_18Prof., Scientific & Technical Services -7.569e-03  8.585e-04  -8.816
## NAICS_18Management, Admin. & Support           -6.331e-04  8.494e-04  -0.745
## NAICS_18Educational Services                   -3.404e-03  9.488e-04  -3.588
## NAICS_18Health Care & Social Assistance         6.959e-04  9.345e-04   0.745
## NAICS_18Information, Culture & Recreation      -2.467e-03  8.704e-04  -2.834
## NAICS_18Accommodation & Food Services           2.421e-03  8.744e-04   2.769
## NAICS_18Other Services                         -3.382e-03  8.938e-04  -3.784
## NAICS_18Public Administration                  -5.901e-03  9.099e-04  -6.485
## AGE_12.L                                        3.408e-04  6.270e-04   0.544
## AGE_12.Q                                        7.586e-03  5.637e-04  13.458
## AGE_12.C                                        1.305e-03  4.602e-04   2.837
## AGE_12^4                                        5.060e-04  4.104e-04   1.233
## AGE_12^5                                        1.659e-03  3.768e-04   4.401
## AGE_12^6                                       -1.540e-04  3.455e-04  -0.446
## AGE_12^7                                        9.278e-04  3.125e-04   2.969
## AGE_12^8                                        3.330e-04  2.822e-04   1.180
## AGE_12^9                                        5.225e-05  2.607e-04   0.200
## AGE_12^10                                      -6.960e-05  2.506e-04  -0.278
## AGE_12^11                                      -1.912e-04  2.486e-04  -0.769
## PROVPEI                                         3.531e-03  6.786e-04   5.204
## PROVNS                                          2.369e-03  5.852e-04   4.049
## PROVNB                                          3.296e-03  5.856e-04   5.629
## PROVQC                                         -1.536e-04  5.102e-04  -0.301
## PROVON                                         -2.433e-03  4.886e-04  -4.979
## PROVMB                                         -9.361e-05  5.401e-04  -0.173
## PROVSK                                         -2.849e-03  5.482e-04  -5.197
## PROVAB                                         -7.269e-03  5.182e-04 -14.027
## PROVBC                                         -4.697e-03  5.343e-04  -8.791
## EDUC.L                                         -7.634e-03  4.281e-04 -17.835
## EDUC.Q                                         -1.312e-03  3.954e-04  -3.319
## EDUC.C                                         -3.132e-04  3.116e-04  -1.005
## EDUC^4                                          4.845e-04  2.995e-04   1.618
## EDUC^5                                         -1.211e-04  2.081e-04  -0.582
## EDUC^6                                         -2.353e-04  2.676e-04  -0.879
## TENURE                                         -6.340e-03  2.888e-04 -21.953
## FIRMSIZE.L                                     -2.470e-03  2.301e-04 -10.733
## FIRMSIZE.Q                                      3.817e-04  2.132e-04   1.790
## FIRMSIZE.C                                     -3.347e-04  2.158e-04  -1.551
## MARSTATCommon-law                               2.673e-04  2.521e-04   1.060
## MARSTATWidowed                                  2.198e-03  1.196e-03   1.838
## MARSTATSeparated                               -4.209e-04  6.667e-04  -0.631
## MARSTATDivorced                                -9.903e-04  5.780e-04  -1.713
## MARSTATSingle, NM                               1.708e-03  4.143e-04   4.123
## ESTSIZE.L                                      -2.402e-03  2.465e-04  -9.745
## ESTSIZE.Q                                      -8.068e-04  2.041e-04  -3.953
## ESTSIZE.C                                       2.603e-04  1.814e-04   1.435
## FTPTMAINPart-time                               5.357e-03  4.205e-04  12.739
## UTOTHRS                                         5.095e-03  1.144e-03   4.453
## PERMTEMPTemp. season                            2.886e-03  4.069e-04   7.093
## PERMTEMPTemp. contract                          7.424e-04  3.657e-04   2.030
## PERMTEMPTemp. casual                            2.263e-03  5.098e-04   4.439
## EFAMTYPEHWDENC                                 -7.464e-04  4.361e-04  -1.711
## EFAMTYPEHWDE17                                  7.805e-04  5.812e-04   1.343
## EFAMTYPEHWDE24                                  1.253e-03  5.458e-04   2.296
## EFAMTYPEHWSHNC                                 -1.097e-03  5.005e-04  -2.191
## EFAMTYPEHWSH17                                  9.804e-04  6.578e-04   1.491
## EFAMTYPEHWSH24                                  4.950e-04  8.669e-04   0.571
## EFAMTYPEHWSWNC                                  1.694e-03  1.202e-03   1.409
## EFAMTYPEHWSW17                                  1.462e-03  1.909e-03   0.766
## EFAMTYPEHWSW24                                 -5.552e-04  1.498e-03  -0.371
## EFAMTYPEHWNENC                                  2.464e-03  9.107e-04   2.706
## EFAMTYPEHWNE17                                  7.108e-03  2.649e-03   2.683
## EFAMTYPEHWNE24                                 -1.888e-04  2.226e-03  -0.085
## EFAMTYPESPE17                                  -1.586e-04  6.982e-04  -0.227
## EFAMTYPESPE24                                   1.493e-03  7.601e-04   1.964
## EFAMTYPESPN17                                  -2.651e-04  1.608e-03  -0.165
## EFAMTYPESPN24                                   2.183e-03  1.858e-03   1.175
## EFAMTYPEOther                                   1.731e-03  3.797e-04   4.558
## SCHOOLNFull-time student                        8.401e-04  5.227e-04   1.607
## SCHOOLNPart-time student                        3.037e-03  6.862e-04   4.426
## SCHOOLNUnknown                                         NA         NA      NA
## CMAToronto                                      8.376e-04  5.958e-04   1.406
## CMAVancouver                                    2.164e-03  6.783e-04   3.191
## CMAOther                                        5.702e-04  4.614e-04   1.236
## UNIONNot member but covered                    -7.646e-04  6.058e-04  -1.262
## UNIONNon-unionized                              1.915e-03  2.256e-04   8.486
## AGYOWNK.L                                       1.261e-03  5.528e-04   2.281
## AGYOWNK.Q                                       1.272e-03  3.761e-04   3.382
## AGYOWNK.C                                       4.742e-04  4.217e-04   1.124
## AGYOWNK^4                                       1.671e-04  3.976e-04   0.420
## MJHMultiple jobholder                           7.441e-04  4.248e-04   1.752
## COWMAINPrivate sector                           3.337e-04  3.968e-04   0.841
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## NOC_10Business, finance & administration        < 2e-16 ***
## NOC_10Natural & applied sciences                < 2e-16 ***
## NOC_10Health                                    < 2e-16 ***
## NOC_10Educ., law, community & gov. services     < 2e-16 ***
## NOC_10Art, culture, recreation & sport          < 2e-16 ***
## NOC_10Sales & service                           < 2e-16 ***
## NOC_10Trades, transport & equipm. operators     < 2e-16 ***
## NOC_10Natural resources & agriculture           < 2e-16 ***
## NOC_10Manufacturing & utilities                 < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas      < 2e-16 ***
## NAICS_18Utilities                               < 2e-16 ***
## NAICS_18Construction                            < 2e-16 ***
## NAICS_18Manufacturing durables                 6.51e-07 ***
## NAICS_18Manufacturing non-durables             0.000130 ***
## NAICS_18Wholesale Trade                        3.02e-12 ***
## NAICS_18Retail Trade                           0.203213    
## NAICS_18Transportation & Warehousing           0.001888 ** 
## NAICS_18Finance, Insurance, Real Est. & Leas.  7.07e-12 ***
## NAICS_18Prof., Scientific & Technical Services  < 2e-16 ***
## NAICS_18Management, Admin. & Support           0.456062    
## NAICS_18Educational Services                   0.000335 ***
## NAICS_18Health Care & Social Assistance        0.456521    
## NAICS_18Information, Culture & Recreation      0.004600 ** 
## NAICS_18Accommodation & Food Services          0.005627 ** 
## NAICS_18Other Services                         0.000155 ***
## NAICS_18Public Administration                  9.08e-11 ***
## AGE_12.L                                       0.586753    
## AGE_12.Q                                        < 2e-16 ***
## AGE_12.C                                       0.004563 ** 
## AGE_12^4                                       0.217689    
## AGE_12^5                                       1.08e-05 ***
## AGE_12^6                                       0.655704    
## AGE_12^7                                       0.002991 ** 
## AGE_12^8                                       0.237998    
## AGE_12^9                                       0.841170    
## AGE_12^10                                      0.781179    
## AGE_12^11                                      0.441900    
## PROVPEI                                        1.98e-07 ***
## PROVNS                                         5.17e-05 ***
## PROVNB                                         1.84e-08 ***
## PROVQC                                         0.763376    
## PROVON                                         6.45e-07 ***
## PROVMB                                         0.862393    
## PROVSK                                         2.05e-07 ***
## PROVAB                                          < 2e-16 ***
## PROVBC                                          < 2e-16 ***
## EDUC.L                                          < 2e-16 ***
## EDUC.Q                                         0.000905 ***
## EDUC.C                                         0.314844    
## EDUC^4                                         0.105728    
## EDUC^5                                         0.560434    
## EDUC^6                                         0.379268    
## TENURE                                          < 2e-16 ***
## FIRMSIZE.L                                      < 2e-16 ***
## FIRMSIZE.Q                                     0.073443 .  
## FIRMSIZE.C                                     0.120931    
## MARSTATCommon-law                              0.288969    
## MARSTATWidowed                                 0.066055 .  
## MARSTATSeparated                               0.527849    
## MARSTATDivorced                                0.086697 .  
## MARSTATSingle, NM                              3.76e-05 ***
## ESTSIZE.L                                       < 2e-16 ***
## ESTSIZE.Q                                      7.75e-05 ***
## ESTSIZE.C                                      0.151296    
## FTPTMAINPart-time                               < 2e-16 ***
## UTOTHRS                                        8.51e-06 ***
## PERMTEMPTemp. season                           1.36e-12 ***
## PERMTEMPTemp. contract                         0.042358 *  
## PERMTEMPTemp. casual                           9.10e-06 ***
## EFAMTYPEHWDENC                                 0.087013 .  
## EFAMTYPEHWDE17                                 0.179368    
## EFAMTYPEHWDE24                                 0.021695 *  
## EFAMTYPEHWSHNC                                 0.028471 *  
## EFAMTYPEHWSH17                                 0.136104    
## EFAMTYPEHWSH24                                 0.568034    
## EFAMTYPEHWSWNC                                 0.158891    
## EFAMTYPEHWSW17                                 0.443709    
## EFAMTYPEHWSW24                                 0.710996    
## EFAMTYPEHWNENC                                 0.006812 ** 
## EFAMTYPEHWNE17                                 0.007294 ** 
## EFAMTYPEHWNE24                                 0.932427    
## EFAMTYPESPE17                                  0.820329    
## EFAMTYPESPE24                                  0.049558 *  
## EFAMTYPESPN17                                  0.869080    
## EFAMTYPESPN24                                  0.240192    
## EFAMTYPEOther                                  5.19e-06 ***
## SCHOOLNFull-time student                       0.108026    
## SCHOOLNPart-time student                       9.66e-06 ***
## SCHOOLNUnknown                                       NA    
## CMAToronto                                     0.159738    
## CMAVancouver                                   0.001423 ** 
## CMAOther                                       0.216587    
## UNIONNot member but covered                    0.206887    
## UNIONNon-unionized                              < 2e-16 ***
## AGYOWNK.L                                      0.022569 *  
## AGYOWNK.Q                                      0.000721 ***
## AGYOWNK.C                                      0.260905    
## AGYOWNK^4                                      0.674359    
## MJHMultiple jobholder                          0.079817 .  
## COWMAINPrivate sector                          0.400269    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01079 on 17749 degrees of freedom
## Multiple R-squared:  0.552,  Adjusted R-squared:  0.5495 
## F-statistic: 220.9 on 99 and 17749 DF,  p-value: < 2.2e-16
# Diagnostic plots for the transformed-response model, drawn with the same
# margins as the diagnostic plots of the untransformed model
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(new.model, lwd = 6)

# Skewness of the residuals
# Model fitted on the untransformed response:
skewness(model[["residuals"]])
## [1] 1.032538
# Model fitted on the transformed response:
skewness(new.model[["residuals"]])
## [1] 0.4713438
# Prediction
# Predict the test set with new.model. Because new.model was fitted on
# (HRLYEARN)^-0.03, the predictions are on that transformed scale, not in the
# original HRLYEARN units. interval = "prediction" is requested, but only the
# "fit" column of the result is used by the error calculations that follow.
# NOTE(review): the rank-deficiency warning printed below presumably stems from
# the SCHOOLNUnknown coefficient, which the model summary reports as NA
# ("1 not defined because of singularities") -- confirm.
prediction <- predict(new.model, interval = "prediction", newdata = test.19male)
## Warning in predict.lm(new.model, interval = "prediction", newdata = test.
## 19male): prediction from a rank-deficient fit may be misleading
# Test-set errors: predicted minus observed, both on the HRLYEARN^-0.03 scale
errors <- prediction[, "fit"] - test.19male[["HRLYEARN"]]^-0.03
hist(errors)

# Accuracy metrics for the test-set predictions: root mean squared error, mean
# absolute error, and the share of cases whose absolute error is at most 25%
# of the observed value.
# NOTE(review): `errors` and the observed values used here are on the
# transformed HRLYEARN^-0.03 scale, where the fitted intercept is about 1.04
# and the RMSE about 0.011. Relative errors are therefore around 1%, which
# makes the 25% criterion trivially satisfied (it prints 100). Consider
# back-transforming predictions to the original HRLYEARN scale before
# computing these metrics -- TODO confirm intent.
rmse <- sqrt(sum((errors)^2) / nrow(test.19male))
mae <- (1 / nrow(test.19male)) * sum(abs(errors))
diff.percent <- 100 * (abs(errors) / (test.19male$HRLYEARN^-0.03))
# Count only comparisons that are TRUE. Subsetting with the logical vector
# (diff.percent[diff.percent <= 25]) would also keep NA results and count them
# as cases within 25%.
diff.25 <- sum(diff.percent <= 25, na.rm = TRUE) / nrow(test.19male)
paste("RMSE:", rmse)
## [1] "RMSE: 0.0110021049366726"
paste("MAE:", mae)
## [1] "MAE: 0.00830950408353929"
paste("Percentage of cases with less than 25% error:", diff.25*100)
## [1] "Percentage of cases with less than 25% error: 100"
# Females 2009
# Model
# Upper scope for the stepwise search: every candidate predictor.
full <- lm(
  HRLYEARN ~ LFSSTAT + PROV + CMA + AGE_12 + MARSTAT + EDUC + MJH + COWMAIN +
    NAICS_18 + NOC_10 + FTPTMAIN + UTOTHRS + TENURE + UNION + PERMTEMP +
    ESTSIZE + FIRMSIZE + SCHOOLN + EFAMTYPE + AGYOWNK,
  data = train.09fem
)
# Lower scope and starting point: intercept-only model.
null <- lm(HRLYEARN ~ 1, data = train.09fem)
# Stepwise selection with stepAIC(), starting from `null` and allowed to add
# or drop terms (direction = "both") between the two scopes.
# Set trace = TRUE to show the steps of adding and subtracting variables.
# FALSE is spelled out because the alias F is an ordinary variable that can
# be reassigned.
model <- stepAIC(
  null,
  scope = list(lower = null, upper = full),
  direction = "both",
  trace = FALSE
)
summary(model)
## 
## Call:
## lm(formula = HRLYEARN ~ NOC_10 + NAICS_18 + EDUC + TENURE + ESTSIZE + 
##     PROV + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN + 
##     UTOTHRS + UNION + AGYOWNK + SCHOOLN + CMA, data = train.09fem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26133 -0.03550 -0.00523  0.02602  0.50701 
## 
## Coefficients: (1 not defined because of singularities)
##                                                  Estimate Std. Error t value
## (Intercept)                                     2.075e-01  9.059e-03  22.904
## NOC_10Business, finance & administration       -7.498e-02  2.193e-03 -34.197
## NOC_10Natural & applied sciences               -3.951e-02  3.490e-03 -11.322
## NOC_10Health                                   -2.123e-02  2.637e-03  -8.052
## NOC_10Educ., law, community & gov. services    -3.279e-02  2.505e-03 -13.090
## NOC_10Art, culture, recreation & sport         -5.623e-02  3.860e-03 -14.569
## NOC_10Sales & service                          -8.616e-02  2.266e-03 -38.024
## NOC_10Trades, transport & equipm. operators    -8.815e-02  4.012e-03 -21.974
## NOC_10Natural resources & agriculture          -7.752e-02  6.905e-03 -11.227
## NOC_10Manufacturing & utilities                -1.015e-01  4.013e-03 -25.282
## NAICS_18Forestry, Fishing, Min., Oil & Gas      7.122e-02  8.168e-03   8.720
## NAICS_18Utilities                               4.498e-02  9.716e-03   4.629
## NAICS_18Construction                            3.144e-02  8.216e-03   3.826
## NAICS_18Manufacturing durables                  3.825e-02  7.882e-03   4.853
## NAICS_18Manufacturing non-durables              1.797e-02  7.747e-03   2.320
## NAICS_18Wholesale Trade                         2.638e-02  7.931e-03   3.326
## NAICS_18Retail Trade                           -5.281e-03  7.328e-03  -0.721
## NAICS_18Transportation & Warehousing            2.109e-02  7.789e-03   2.707
## NAICS_18Finance, Insurance, Real Est. & Leas.   3.000e-02  7.415e-03   4.046
## NAICS_18Prof., Scientific & Technical Services  3.410e-02  7.553e-03   4.515
## NAICS_18Management, Admin. & Support            6.401e-03  7.585e-03   0.844
## NAICS_18Educational Services                    1.823e-02  7.543e-03   2.417
## NAICS_18Health Care & Social Assistance         2.895e-03  7.365e-03   0.393
## NAICS_18Information, Culture & Recreation       1.362e-02  7.517e-03   1.811
## NAICS_18Accommodation & Food Services          -1.083e-03  7.400e-03  -0.146
## NAICS_18Other Services                          1.848e-02  7.527e-03   2.456
## NAICS_18Public Administration                   3.788e-02  7.548e-03   5.019
## EDUC.L                                          6.333e-02  2.601e-03  24.348
## EDUC.Q                                          2.693e-02  2.338e-03  11.517
## EDUC.C                                          7.972e-03  1.864e-03   4.278
## EDUC^4                                         -5.010e-03  1.624e-03  -3.085
## EDUC^5                                         -5.629e-03  1.172e-03  -4.805
## EDUC^6                                         -4.598e-03  1.314e-03  -3.499
## TENURE                                          3.909e-02  1.661e-03  23.541
## ESTSIZE.L                                       2.316e-02  1.351e-03  17.145
## ESTSIZE.Q                                       2.206e-03  1.139e-03   1.937
## ESTSIZE.C                                       8.789e-05  1.033e-03   0.085
## PROVPEI                                        -3.210e-03  3.677e-03  -0.873
## PROVNS                                          3.114e-03  3.183e-03   0.978
## PROVNB                                         -1.525e-03  3.160e-03  -0.483
## PROVQC                                          1.095e-02  2.776e-03   3.946
## PROVON                                          2.325e-02  2.651e-03   8.771
## PROVMB                                          8.519e-03  2.888e-03   2.950
## PROVSK                                          1.961e-02  2.988e-03   6.563
## PROVAB                                          4.030e-02  2.864e-03  14.073
## PROVBC                                          2.689e-02  3.019e-03   8.906
## AGE_12.L                                        8.367e-03  4.642e-03   1.802
## AGE_12.Q                                       -2.385e-02  4.715e-03  -5.058
## AGE_12.C                                       -1.874e-03  4.178e-03  -0.449
## AGE_12^4                                        1.114e-02  3.668e-03   3.037
## AGE_12^5                                       -5.517e-03  3.198e-03  -1.725
## AGE_12^6                                        5.822e-03  2.740e-03   2.125
## AGE_12^7                                       -3.147e-03  2.285e-03  -1.377
## AGE_12^8                                        1.859e-03  1.894e-03   0.982
## AGE_12^9                                        2.688e-04  1.610e-03   0.167
## AGE_12^10                                       5.326e-04  1.426e-03   0.373
## AGE_12^11                                      -2.374e-03  1.308e-03  -1.815
## COWMAINPrivate sector                          -1.836e-02  1.771e-03 -10.369
## PERMTEMPTemp. season                           -1.205e-02  2.889e-03  -4.169
## PERMTEMPTemp. contract                         -1.133e-02  1.908e-03  -5.939
## PERMTEMPTemp. casual                           -1.017e-02  2.313e-03  -4.397
## FIRMSIZE.L                                      6.336e-03  1.202e-03   5.270
## FIRMSIZE.Q                                      1.592e-03  1.193e-03   1.334
## FIRMSIZE.C                                      1.346e-03  1.275e-03   1.056
## FTPTMAINPart-time                              -1.116e-02  1.748e-03  -6.384
## UTOTHRS                                        -3.022e-02  7.155e-03  -4.224
## UNIONNot member but covered                     8.201e-03  3.415e-03   2.401
## UNIONNon-unionized                             -3.812e-03  1.352e-03  -2.819
## AGYOWNK.L                                      -3.545e-03  1.344e-03  -2.638
## AGYOWNK.Q                                       3.083e-03  1.419e-03   2.173
## AGYOWNK.C                                       1.063e-04  1.474e-03   0.072
## AGYOWNK^4                                       1.507e-03  1.471e-03   1.024
## SCHOOLNFull-time student                       -5.050e-03  2.407e-03  -2.098
## SCHOOLNPart-time student                        2.114e-03  2.730e-03   0.774
## SCHOOLNUnknown                                         NA         NA      NA
## CMAToronto                                      3.005e-03  3.287e-03   0.914
## CMAVancouver                                   -1.787e-03  3.734e-03  -0.478
## CMAOther                                       -2.091e-03  2.500e-03  -0.836
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## NOC_10Business, finance & administration        < 2e-16 ***
## NOC_10Natural & applied sciences                < 2e-16 ***
## NOC_10Health                                   8.63e-16 ***
## NOC_10Educ., law, community & gov. services     < 2e-16 ***
## NOC_10Art, culture, recreation & sport          < 2e-16 ***
## NOC_10Sales & service                           < 2e-16 ***
## NOC_10Trades, transport & equipm. operators     < 2e-16 ***
## NOC_10Natural resources & agriculture           < 2e-16 ***
## NOC_10Manufacturing & utilities                 < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas      < 2e-16 ***
## NAICS_18Utilities                              3.70e-06 ***
## NAICS_18Construction                           0.000130 ***
## NAICS_18Manufacturing durables                 1.23e-06 ***
## NAICS_18Manufacturing non-durables             0.020365 *  
## NAICS_18Wholesale Trade                        0.000882 ***
## NAICS_18Retail Trade                           0.471159    
## NAICS_18Transportation & Warehousing           0.006793 ** 
## NAICS_18Finance, Insurance, Real Est. & Leas.  5.24e-05 ***
## NAICS_18Prof., Scientific & Technical Services 6.37e-06 ***
## NAICS_18Management, Admin. & Support           0.398750    
## NAICS_18Educational Services                   0.015640 *  
## NAICS_18Health Care & Social Assistance        0.694278    
## NAICS_18Information, Culture & Recreation      0.070103 .  
## NAICS_18Accommodation & Food Services          0.883633    
## NAICS_18Other Services                         0.014072 *  
## NAICS_18Public Administration                  5.24e-07 ***
## EDUC.L                                          < 2e-16 ***
## EDUC.Q                                          < 2e-16 ***
## EDUC.C                                         1.90e-05 ***
## EDUC^4                                         0.002042 ** 
## EDUC^5                                         1.56e-06 ***
## EDUC^6                                         0.000468 ***
## TENURE                                          < 2e-16 ***
## ESTSIZE.L                                       < 2e-16 ***
## ESTSIZE.Q                                      0.052762 .  
## ESTSIZE.C                                      0.932167    
## PROVPEI                                        0.382669    
## PROVNS                                         0.327954    
## PROVNB                                         0.629444    
## PROVQC                                         7.99e-05 ***
## PROVON                                          < 2e-16 ***
## PROVMB                                         0.003186 ** 
## PROVSK                                         5.41e-11 ***
## PROVAB                                          < 2e-16 ***
## PROVBC                                          < 2e-16 ***
## AGE_12.L                                       0.071489 .  
## AGE_12.Q                                       4.27e-07 ***
## AGE_12.C                                       0.653695    
## AGE_12^4                                       0.002393 ** 
## AGE_12^5                                       0.084478 .  
## AGE_12^6                                       0.033605 *  
## AGE_12^7                                       0.168412    
## AGE_12^8                                       0.326353    
## AGE_12^9                                       0.867382    
## AGE_12^10                                      0.708860    
## AGE_12^11                                      0.069595 .  
## COWMAINPrivate sector                           < 2e-16 ***
## PERMTEMPTemp. season                           3.07e-05 ***
## PERMTEMPTemp. contract                         2.92e-09 ***
## PERMTEMPTemp. casual                           1.10e-05 ***
## FIRMSIZE.L                                     1.38e-07 ***
## FIRMSIZE.Q                                     0.182105    
## FIRMSIZE.C                                     0.291076    
## FTPTMAINPart-time                              1.77e-10 ***
## UTOTHRS                                        2.41e-05 ***
## UNIONNot member but covered                    0.016348 *  
## UNIONNon-unionized                             0.004825 ** 
## AGYOWNK.L                                      0.008338 ** 
## AGYOWNK.Q                                      0.029798 *  
## AGYOWNK.C                                      0.942483    
## AGYOWNK^4                                      0.305664    
## SCHOOLNFull-time student                       0.035956 *  
## SCHOOLNPart-time student                       0.438821    
## SCHOOLNUnknown                                       NA    
## CMAToronto                                     0.360587    
## CMAVancouver                                   0.632330    
## CMAOther                                       0.402976    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06268 on 19253 degrees of freedom
## Multiple R-squared:  0.5226, Adjusted R-squared:  0.5207 
## F-statistic: 277.3 on 76 and 19253 DF,  p-value: < 2.2e-16
# Diagnostic plots for the fitted model summarised above.
# Optional 2 x 2 panel layout, left disabled:
# par(mfrow = c(2, 2))
# Figure margins, in lines of text (bottom, left, top, right).
par(mar = c(5.1, 4.1, 4.1, 2.1))
# Draw the lm diagnostic plots with a line width of 6.
plot(x = model, lwd = 6)

# Box-Cox transformation, using the attributes selected by the stepwise
# regression as predictors of the untransformed HRLYEARN.
bc.model <- lm(
  formula = HRLYEARN ~ NOC_10 + NAICS_18 + EDUC + TENURE + ESTSIZE +
    PROV + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN +
    UTOTHRS + UNION + AGYOWNK + SCHOOLN + CMA,
  data = train.09fem
)
# Widen the bottom margin before drawing the profile log-likelihood plot.
par(mar = c(15, 5, 2, 1))
# Profile the log-likelihood over lambda = -3, -2, ..., 3.
bc <- boxcox(bc.model, lambda = -3:3)

# Lambda at which the Box-Cox profile log-likelihood is largest.
# which.max() returns a single index (the first maximum), so exactly one
# lambda is reported and no exact == comparison between doubles is needed.
bc$x[which.max(bc$y)]
## [1] -0.1515152
# Refit with the power-transformed response HRLYEARN^-0.15, i.e. the Box-Cox
# lambda reported above rounded to two decimals.
# NOTE(review): the predictors here differ from bc.model (MARSTAT, LFSSTAT and
# MJH appear instead of COWMAIN, SCHOOLN and CMA) -- confirm this is intended.
new.model <- lm(
  formula = (HRLYEARN)^-0.15 ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
    TENURE + ESTSIZE + AGYOWNK + FIRMSIZE + PERMTEMP + FTPTMAIN +
    UTOTHRS + MARSTAT + UNION + LFSSTAT + MJH,
  data = train.09fem
)
# Coefficient table and fit statistics for the transformed model.
summary(new.model)
## 
## Call:
## lm(formula = (HRLYEARN)^-0.15 ~ NOC_10 + NAICS_18 + AGE_12 + 
##     PROV + EDUC + TENURE + ESTSIZE + AGYOWNK + FIRMSIZE + PERMTEMP + 
##     FTPTMAIN + UTOTHRS + MARSTAT + UNION + LFSSTAT + MJH, data = train.09fem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31644 -0.04301 -0.00080  0.04104  0.74188 
## 
## Coefficients:
##                                                  Estimate Std. Error t value
## (Intercept)                                     1.3541073  0.0098807 137.046
## NOC_10Business, finance & administration        0.0653838  0.0024968  26.187
## NOC_10Natural & applied sciences                0.0335933  0.0039718   8.458
## NOC_10Health                                    0.0159336  0.0029972   5.316
## NOC_10Educ., law, community & gov. services     0.0313051  0.0028518  10.977
## NOC_10Art, culture, recreation & sport          0.0488085  0.0043996  11.094
## NOC_10Sales & service                           0.0920522  0.0025807  35.670
## NOC_10Trades, transport & equipm. operators     0.0884843  0.0045658  19.380
## NOC_10Natural resources & agriculture           0.0730422  0.0078595   9.294
## NOC_10Manufacturing & utilities                 0.1105264  0.0045683  24.194
## NAICS_18Forestry, Fishing, Min., Oil & Gas     -0.0909705  0.0092997  -9.782
## NAICS_18Utilities                              -0.0805083  0.0110075  -7.314
## NAICS_18Construction                           -0.0624766  0.0093523  -6.680
## NAICS_18Manufacturing durables                 -0.0693666  0.0089749  -7.729
## NAICS_18Manufacturing non-durables             -0.0402018  0.0088197  -4.558
## NAICS_18Wholesale Trade                        -0.0546718  0.0090263  -6.057
## NAICS_18Retail Trade                           -0.0021852  0.0083444  -0.262
## NAICS_18Transportation & Warehousing           -0.0492761  0.0088572  -5.563
## NAICS_18Finance, Insurance, Real Est. & Leas.  -0.0585363  0.0084397  -6.936
## NAICS_18Prof., Scientific & Technical Services -0.0630297  0.0085962  -7.332
## NAICS_18Management, Admin. & Support           -0.0182464  0.0086363  -2.113
## NAICS_18Educational Services                   -0.0579554  0.0084723  -6.841
## NAICS_18Health Care & Social Assistance        -0.0365374  0.0083553  -4.373
## NAICS_18Information, Culture & Recreation      -0.0384057  0.0085543  -4.490
## NAICS_18Accommodation & Food Services          -0.0055077  0.0084281  -0.653
## NAICS_18Other Services                         -0.0395860  0.0085700  -4.619
## NAICS_18Public Administration                  -0.0779594  0.0084541  -9.222
## AGE_12.L                                       -0.0126098  0.0054948  -2.295
## AGE_12.Q                                        0.0483309  0.0054258   8.908
## AGE_12.C                                       -0.0122620  0.0047651  -2.573
## AGE_12^4                                        0.0018906  0.0041823   0.452
## AGE_12^5                                        0.0068987  0.0036494   1.890
## AGE_12^6                                       -0.0073045  0.0031228  -2.339
## AGE_12^7                                        0.0064846  0.0026029   2.491
## AGE_12^8                                       -0.0035263  0.0021565  -1.635
## AGE_12^9                                       -0.0006300  0.0018328  -0.344
## AGE_12^10                                      -0.0007318  0.0016236  -0.451
## AGE_12^11                                       0.0017424  0.0014896   1.170
## PROVPEI                                         0.0010888  0.0041870   0.260
## PROVNS                                         -0.0051395  0.0036245  -1.418
## PROVNB                                          0.0003744  0.0035984   0.104
## PROVQC                                         -0.0209276  0.0031011  -6.749
## PROVON                                         -0.0333490  0.0029829 -11.180
## PROVMB                                         -0.0182755  0.0032895  -5.556
## PROVSK                                         -0.0339569  0.0034036  -9.977
## PROVAB                                         -0.0603071  0.0032609 -18.494
## PROVBC                                         -0.0410203  0.0032133 -12.766
## EDUC.L                                         -0.0686123  0.0029564 -23.208
## EDUC.Q                                         -0.0140310  0.0026620  -5.271
## EDUC.C                                         -0.0056037  0.0021219  -2.641
## EDUC^4                                          0.0046728  0.0018482   2.528
## EDUC^5                                          0.0071201  0.0013327   5.343
## EDUC^6                                          0.0029028  0.0014854   1.954
## TENURE                                         -0.0495809  0.0018875 -26.269
## ESTSIZE.L                                      -0.0251087  0.0015274 -16.439
## ESTSIZE.Q                                      -0.0006827  0.0012910  -0.529
## ESTSIZE.C                                       0.0002798  0.0011746   0.238
## AGYOWNK.L                                       0.0017249  0.0015918   1.084
## AGYOWNK.Q                                      -0.0024575  0.0016280  -1.510
## AGYOWNK.C                                       0.0002202  0.0016845   0.131
## AGYOWNK^4                                      -0.0028836  0.0016757  -1.721
## FIRMSIZE.L                                     -0.0108208  0.0013576  -7.971
## FIRMSIZE.Q                                      0.0008824  0.0013584   0.650
## FIRMSIZE.C                                     -0.0021363  0.0014512  -1.472
## PERMTEMPTemp. season                            0.0178452  0.0032867   5.430
## PERMTEMPTemp. contract                          0.0089444  0.0021669   4.128
## PERMTEMPTemp. casual                            0.0115115  0.0026312   4.375
## FTPTMAINPart-time                               0.0186885  0.0021900   8.533
## UTOTHRS                                         0.0203361  0.0090995   2.235
## MARSTATCommon-law                               0.0010521  0.0016940   0.621
## MARSTATWidowed                                  0.0092272  0.0041691   2.213
## MARSTATSeparated                                0.0012285  0.0028667   0.429
## MARSTATDivorced                                 0.0002745  0.0022475   0.122
## MARSTATSingle, NM                               0.0035107  0.0016846   2.084
## UNIONNot member but covered                    -0.0001607  0.0038790  -0.041
## UNIONNon-unionized                              0.0207842  0.0014746  14.094
## LFSSTATEmployed, absent from work               0.0021582  0.0018875   1.143
## MJHMultiple jobholder                           0.0002582  0.0024231   0.107
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## NOC_10Business, finance & administration        < 2e-16 ***
## NOC_10Natural & applied sciences                < 2e-16 ***
## NOC_10Health                                   1.07e-07 ***
## NOC_10Educ., law, community & gov. services     < 2e-16 ***
## NOC_10Art, culture, recreation & sport          < 2e-16 ***
## NOC_10Sales & service                           < 2e-16 ***
## NOC_10Trades, transport & equipm. operators     < 2e-16 ***
## NOC_10Natural resources & agriculture           < 2e-16 ***
## NOC_10Manufacturing & utilities                 < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas      < 2e-16 ***
## NAICS_18Utilities                              2.70e-13 ***
## NAICS_18Construction                           2.45e-11 ***
## NAICS_18Manufacturing durables                 1.14e-14 ***
## NAICS_18Manufacturing non-durables             5.19e-06 ***
## NAICS_18Wholesale Trade                        1.41e-09 ***
## NAICS_18Retail Trade                            0.79342    
## NAICS_18Transportation & Warehousing           2.68e-08 ***
## NAICS_18Finance, Insurance, Real Est. & Leas.  4.17e-12 ***
## NAICS_18Prof., Scientific & Technical Services 2.35e-13 ***
## NAICS_18Management, Admin. & Support            0.03463 *  
## NAICS_18Educational Services                   8.12e-12 ***
## NAICS_18Health Care & Social Assistance        1.23e-05 ***
## NAICS_18Information, Culture & Recreation      7.18e-06 ***
## NAICS_18Accommodation & Food Services           0.51345    
## NAICS_18Other Services                         3.88e-06 ***
## NAICS_18Public Administration                   < 2e-16 ***
## AGE_12.L                                        0.02175 *  
## AGE_12.Q                                        < 2e-16 ***
## AGE_12.C                                        0.01008 *  
## AGE_12^4                                        0.65124    
## AGE_12^5                                        0.05873 .  
## AGE_12^6                                        0.01934 *  
## AGE_12^7                                        0.01274 *  
## AGE_12^8                                        0.10203    
## AGE_12^9                                        0.73106    
## AGE_12^10                                       0.65218    
## AGE_12^11                                       0.24210    
## PROVPEI                                         0.79482    
## PROVNS                                          0.15620    
## PROVNB                                          0.91714    
## PROVQC                                         1.54e-11 ***
## PROVON                                          < 2e-16 ***
## PROVMB                                         2.80e-08 ***
## PROVSK                                          < 2e-16 ***
## PROVAB                                          < 2e-16 ***
## PROVBC                                          < 2e-16 ***
## EDUC.L                                          < 2e-16 ***
## EDUC.Q                                         1.37e-07 ***
## EDUC.C                                          0.00828 ** 
## EDUC^4                                          0.01147 *  
## EDUC^5                                         9.26e-08 ***
## EDUC^6                                          0.05069 .  
## TENURE                                          < 2e-16 ***
## ESTSIZE.L                                       < 2e-16 ***
## ESTSIZE.Q                                       0.59692    
## ESTSIZE.C                                       0.81171    
## AGYOWNK.L                                       0.27855    
## AGYOWNK.Q                                       0.13118    
## AGYOWNK.C                                       0.89598    
## AGYOWNK^4                                       0.08529 .  
## FIRMSIZE.L                                     1.66e-15 ***
## FIRMSIZE.Q                                      0.51598    
## FIRMSIZE.C                                      0.14102    
## PERMTEMPTemp. season                           5.72e-08 ***
## PERMTEMPTemp. contract                         3.68e-05 ***
## PERMTEMPTemp. casual                           1.22e-05 ***
## FTPTMAINPart-time                               < 2e-16 ***
## UTOTHRS                                         0.02544 *  
## MARSTATCommon-law                               0.53455    
## MARSTATWidowed                                  0.02689 *  
## MARSTATSeparated                                0.66826    
## MARSTATDivorced                                 0.90279    
## MARSTATSingle, NM                               0.03718 *  
## UNIONNot member but covered                     0.96695    
## UNIONNon-unionized                              < 2e-16 ***
## LFSSTATEmployed, absent from work               0.25286    
## MJHMultiple jobholder                           0.91514    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07137 on 19252 degrees of freedom
## Multiple R-squared:  0.5881, Adjusted R-squared:  0.5864 
## F-statistic: 356.9 on 77 and 19252 DF,  p-value: < 2.2e-16
# Diagnostic plots for the transformed model.
# Figure margins, in lines of text (bottom, left, top, right).
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = new.model, lwd = 6)

# Residual skewness
# Before the transformation (model fitted to the untransformed HRLYEARN):
skewness(model[["residuals"]])
## [1] 1.181981
# After the transformation (model fitted to HRLYEARN^-0.15):
skewness(new.model[["residuals"]])
## [1] 0.5286001
# Test Set Correction!!!
# The Females 2009 test set contains an observation where HRLYEARN (hourly
# wages) is equal to zero. Zero raised to the power -0.15 is Inf, so that case
# cannot be used later on to calculate RMSE and MAE on the transformed scale.
# It is excluded from the test set.
# The case is selected by its value instead of a fixed row position, so the
# right row is dropped even if the train/test split changes, and running this
# code a second time removes nothing further. Rows with a missing HRLYEARN are
# left in place.
zero.wage <- !is.na(test.09fem$HRLYEARN) & test.09fem$HRLYEARN == 0
test.09fem <- test.09fem[!zero.wage, ]
# Prediction: fitted values with prediction intervals for the test cases
prediction <- predict(new.model, interval = "prediction", newdata = test.09fem)
# Errors on the transformed scale: predicted minus observed HRLYEARN^-0.15
errors <- prediction[, "fit"] - (test.09fem$HRLYEARN^(-0.15))
hist(errors)

# Accuracy measures, all computed on the transformed (HRLYEARN^-0.15) scale.
# Root mean squared error
rmse <- sqrt(sum(errors^2) / nrow(test.09fem))
# Mean absolute error
mae <- sum(abs(errors)) * (1 / nrow(test.09fem))
# Absolute error as a percentage of the observed transformed value
diff.percent <- (abs(errors) / test.09fem$HRLYEARN^(-0.15)) * 100
# Proportion of test cases whose percentage error is at most 25
diff.25 <- length(diff.percent[diff.percent <= 25]) / nrow(test.09fem)
paste("RMSE:", rmse, sep = " ")
## [1] "RMSE: 0.0727054261959815"
# Report the mean absolute error (transformed scale).
paste("MAE:", mae, sep = " ")
## [1] "MAE: 0.0534999014295685"
# Report the share of test cases within 25% error, as a percentage.
paste("Percentage of cases with less than 25% error:", 100 * diff.25, sep = " ")
## [1] "Percentage of cases with less than 25% error: 99.8430709802028"
# Females 2019
# Model
# Upper bound of the search: every candidate predictor of HRLYEARN.
full <- lm(HRLYEARN ~ LFSSTAT + PROV + CMA + AGE_12 + MARSTAT + EDUC + MJH +
             COWMAIN + NAICS_18 + NOC_10 + FTPTMAIN + UTOTHRS + TENURE +
             UNION + PERMTEMP + ESTSIZE + FIRMSIZE + SCHOOLN + EFAMTYPE +
             AGYOWNK,
           data = train.19fem)
# Lower bound and starting point of the search: intercept-only model.
null <- lm(HRLYEARN ~ 1, data = train.19fem)
# Stepwise selection by AIC, adding and dropping terms in both directions.
# trace is spelled FALSE because F is an ordinary variable that can be
# reassigned; set trace = TRUE to show the steps of adding and subtracting vars.
# Note: this reassigns `model`, replacing the fit used in the 2009 section.
model <- stepAIC(null, scope = list(lower = null, upper = full),
                 direction = "both", trace = FALSE)
summary(model)
## 
## Call:
## lm(formula = HRLYEARN ~ NOC_10 + EDUC + TENURE + NAICS_18 + PROV + 
##     ESTSIZE + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN + 
##     UTOTHRS + EFAMTYPE + LFSSTAT + SCHOOLN + MARSTAT, data = train.19fem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.35100 -0.04429 -0.00725  0.03222  0.70051 
## 
## Coefficients: (1 not defined because of singularities)
##                                                  Estimate Std. Error t value
## (Intercept)                                     2.903e-01  1.195e-02  24.297
## NOC_10Business, finance & administration       -1.021e-01  2.950e-03 -34.631
## NOC_10Natural & applied sciences               -4.995e-02  4.288e-03 -11.651
## NOC_10Health                                   -5.306e-02  3.412e-03 -15.548
## NOC_10Educ., law, community & gov. services    -7.682e-02  3.199e-03 -24.010
## NOC_10Art, culture, recreation & sport         -1.055e-01  5.175e-03 -20.385
## NOC_10Sales & service                          -1.275e-01  3.084e-03 -41.359
## NOC_10Trades, transport & equipm. operators    -1.081e-01  5.059e-03 -21.361
## NOC_10Natural resources & agriculture          -9.893e-02  9.121e-03 -10.847
## NOC_10Manufacturing & utilities                -1.363e-01  5.493e-03 -24.807
## EDUC.L                                          8.306e-02  3.977e-03  20.886
## EDUC.Q                                          3.829e-02  3.681e-03  10.401
## EDUC.C                                          2.056e-02  2.912e-03   7.060
## EDUC^4                                         -6.162e-03  2.458e-03  -2.507
## EDUC^5                                         -5.867e-03  1.673e-03  -3.506
## EDUC^6                                         -3.233e-03  1.929e-03  -1.676
## TENURE                                          5.823e-02  2.113e-03  27.557
## NAICS_18Forestry, Fishing, Min., Oil & Gas      1.154e-01  1.045e-02  11.045
## NAICS_18Utilities                               1.092e-01  1.323e-02   8.257
## NAICS_18Construction                            5.067e-02  1.092e-02   4.639
## NAICS_18Manufacturing durables                  4.871e-02  1.101e-02   4.425
## NAICS_18Manufacturing non-durables              3.603e-02  1.082e-02   3.330
## NAICS_18Wholesale Trade                         5.322e-02  1.096e-02   4.854
## NAICS_18Retail Trade                            1.372e-02  1.027e-02   1.336
## NAICS_18Transportation & Warehousing            3.342e-02  1.075e-02   3.109
## NAICS_18Finance, Insurance, Real Est. & Leas.   6.138e-02  1.037e-02   5.917
## NAICS_18Prof., Scientific & Technical Services  5.533e-02  1.046e-02   5.291
## NAICS_18Management, Admin. & Support            2.594e-02  1.051e-02   2.469
## NAICS_18Educational Services                    3.497e-02  1.050e-02   3.331
## NAICS_18Health Care & Social Assistance         1.447e-02  1.028e-02   1.408
## NAICS_18Information, Culture & Recreation       2.704e-02  1.060e-02   2.551
## NAICS_18Accommodation & Food Services           2.081e-02  1.037e-02   2.007
## NAICS_18Other Services                          3.217e-02  1.055e-02   3.049
## NAICS_18Public Administration                   5.522e-02  1.047e-02   5.276
## PROVPEI                                        -1.769e-02  4.654e-03  -3.802
## PROVNS                                         -2.628e-02  4.078e-03  -6.444
## PROVNB                                         -2.415e-02  4.151e-03  -5.817
## PROVQC                                         -4.173e-03  3.547e-03  -1.177
## PROVON                                          8.836e-03  3.420e-03   2.584
## PROVMB                                         -9.988e-03  3.816e-03  -2.617
## PROVSK                                          4.154e-03  3.878e-03   1.071
## PROVAB                                          2.437e-02  3.722e-03   6.547
## PROVBC                                          7.478e-03  3.630e-03   2.060
## ESTSIZE.L                                       1.873e-02  1.695e-03  11.049
## ESTSIZE.Q                                       3.767e-03  1.412e-03   2.667
## ESTSIZE.C                                      -4.077e-04  1.312e-03  -0.311
## AGE_12.L                                       -1.168e-02  4.818e-03  -2.424
## AGE_12.Q                                       -2.766e-02  4.239e-03  -6.525
## AGE_12.C                                       -2.093e-02  3.609e-03  -5.800
## AGE_12^4                                        7.594e-03  3.235e-03   2.348
## AGE_12^5                                       -7.689e-03  2.925e-03  -2.629
## AGE_12^6                                       -2.822e-03  2.602e-03  -1.085
## AGE_12^7                                       -2.552e-03  2.310e-03  -1.105
## AGE_12^8                                       -2.633e-03  2.062e-03  -1.277
## AGE_12^9                                        1.095e-03  1.873e-03   0.585
## AGE_12^10                                       2.825e-04  1.766e-03   0.160
## AGE_12^11                                       1.232e-03  1.744e-03   0.707
## COWMAINPrivate sector                          -2.868e-02  2.132e-03 -13.454
## PERMTEMPTemp. season                           -1.707e-02  3.909e-03  -4.366
## PERMTEMPTemp. contract                         -1.544e-02  2.416e-03  -6.391
## PERMTEMPTemp. casual                           -1.862e-02  2.898e-03  -6.425
## FIRMSIZE.L                                      9.263e-03  1.594e-03   5.812
## FIRMSIZE.Q                                      1.981e-03  1.510e-03   1.312
## FIRMSIZE.C                                     -3.033e-04  1.596e-03  -0.190
## FTPTMAINPart-time                              -1.672e-02  2.232e-03  -7.491
## UTOTHRS                                        -4.984e-02  8.860e-03  -5.625
## EFAMTYPEHWDENC                                 -4.107e-03  2.996e-03  -1.371
## EFAMTYPEHWDE17                                  1.424e-04  2.964e-03   0.048
## EFAMTYPEHWDE24                                 -5.367e-03  3.232e-03  -1.661
## EFAMTYPEHWSHNC                                 -4.386e-03  8.755e-03  -0.501
## EFAMTYPEHWSH17                                 -5.218e-03  7.655e-03  -0.682
## EFAMTYPEHWSH24                                  1.503e-02  1.028e-02   1.463
## EFAMTYPEHWSWNC                                 -7.142e-03  3.678e-03  -1.942
## EFAMTYPEHWSW17                                 -8.043e-03  5.055e-03  -1.591
## EFAMTYPEHWSW24                                 -1.498e-02  6.834e-03  -2.192
## EFAMTYPEHWNENC                                 -1.679e-02  7.322e-03  -2.293
## EFAMTYPEHWNE17                                 -1.304e-02  1.890e-02  -0.690
## EFAMTYPEHWNE24                                 -6.159e-03  1.700e-02  -0.362
## EFAMTYPESPE17                                  -3.546e-03  2.805e-03  -1.264
## EFAMTYPESPE24                                  -1.110e-02  4.316e-03  -2.571
## EFAMTYPESPN17                                  -3.399e-03  1.344e-02  -0.253
## EFAMTYPESPN24                                  -1.034e-02  1.361e-02  -0.760
## EFAMTYPEOther                                  -9.108e-03  2.723e-03  -3.344
## LFSSTATEmployed, absent from work              -5.284e-03  2.060e-03  -2.565
## SCHOOLNFull-time student                       -6.879e-03  3.138e-03  -2.192
## SCHOOLNPart-time student                        1.235e-03  3.696e-03   0.334
## SCHOOLNUnknown                                         NA         NA      NA
## MARSTATCommon-law                               4.295e-05  1.800e-03   0.024
## MARSTATWidowed                                 -7.476e-03  5.341e-03  -1.400
## MARSTATSeparated                                6.652e-03  4.088e-03   1.627
## MARSTATDivorced                                -2.637e-03  3.547e-03  -0.744
## MARSTATSingle, NM                              -3.683e-03  2.711e-03  -1.359
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## NOC_10Business, finance & administration        < 2e-16 ***
## NOC_10Natural & applied sciences                < 2e-16 ***
## NOC_10Health                                    < 2e-16 ***
## NOC_10Educ., law, community & gov. services     < 2e-16 ***
## NOC_10Art, culture, recreation & sport          < 2e-16 ***
## NOC_10Sales & service                           < 2e-16 ***
## NOC_10Trades, transport & equipm. operators     < 2e-16 ***
## NOC_10Natural resources & agriculture           < 2e-16 ***
## NOC_10Manufacturing & utilities                 < 2e-16 ***
## EDUC.L                                          < 2e-16 ***
## EDUC.Q                                          < 2e-16 ***
## EDUC.C                                         1.73e-12 ***
## EDUC^4                                         0.012183 *  
## EDUC^5                                         0.000456 ***
## EDUC^6                                         0.093717 .  
## TENURE                                          < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas      < 2e-16 ***
## NAICS_18Utilities                               < 2e-16 ***
## NAICS_18Construction                           3.53e-06 ***
## NAICS_18Manufacturing durables                 9.69e-06 ***
## NAICS_18Manufacturing non-durables             0.000872 ***
## NAICS_18Wholesale Trade                        1.22e-06 ***
## NAICS_18Retail Trade                           0.181606    
## NAICS_18Transportation & Warehousing           0.001883 ** 
## NAICS_18Finance, Insurance, Real Est. & Leas.  3.34e-09 ***
## NAICS_18Prof., Scientific & Technical Services 1.23e-07 ***
## NAICS_18Management, Admin. & Support           0.013565 *  
## NAICS_18Educational Services                   0.000866 ***
## NAICS_18Health Care & Social Assistance        0.159135    
## NAICS_18Information, Culture & Recreation      0.010765 *  
## NAICS_18Accommodation & Food Services          0.044802 *  
## NAICS_18Other Services                         0.002297 ** 
## NAICS_18Public Administration                  1.33e-07 ***
## PROVPEI                                        0.000144 ***
## PROVNS                                         1.20e-10 ***
## PROVNB                                         6.10e-09 ***
## PROVQC                                         0.239312    
## PROVON                                         0.009780 ** 
## PROVMB                                         0.008868 ** 
## PROVSK                                         0.284191    
## PROVAB                                         6.03e-11 ***
## PROVBC                                         0.039414 *  
## ESTSIZE.L                                       < 2e-16 ***
## ESTSIZE.Q                                      0.007653 ** 
## ESTSIZE.C                                      0.755955    
## AGE_12.L                                       0.015364 *  
## AGE_12.Q                                       6.99e-11 ***
## AGE_12.C                                       6.74e-09 ***
## AGE_12^4                                       0.018903 *  
## AGE_12^5                                       0.008584 ** 
## AGE_12^6                                       0.278109    
## AGE_12^7                                       0.269275    
## AGE_12^8                                       0.201584    
## AGE_12^9                                       0.558889    
## AGE_12^10                                      0.872913    
## AGE_12^11                                      0.479791    
## COWMAINPrivate sector                           < 2e-16 ***
## PERMTEMPTemp. season                           1.27e-05 ***
## PERMTEMPTemp. contract                         1.69e-10 ***
## PERMTEMPTemp. casual                           1.35e-10 ***
## FIRMSIZE.L                                     6.30e-09 ***
## FIRMSIZE.Q                                     0.189563    
## FIRMSIZE.C                                     0.849248    
## FTPTMAINPart-time                              7.15e-14 ***
## UTOTHRS                                        1.88e-08 ***
## EFAMTYPEHWDENC                                 0.170496    
## EFAMTYPEHWDE17                                 0.961690    
## EFAMTYPEHWDE24                                 0.096770 .  
## EFAMTYPEHWSHNC                                 0.616447    
## EFAMTYPEHWSH17                                 0.495480    
## EFAMTYPEHWSH24                                 0.143593    
## EFAMTYPEHWSWNC                                 0.052155 .  
## EFAMTYPEHWSW17                                 0.111591    
## EFAMTYPEHWSW24                                 0.028405 *  
## EFAMTYPEHWNENC                                 0.021864 *  
## EFAMTYPEHWNE17                                 0.490450    
## EFAMTYPEHWNE24                                 0.717066    
## EFAMTYPESPE17                                  0.206294    
## EFAMTYPESPE24                                  0.010151 *  
## EFAMTYPESPN17                                  0.800343    
## EFAMTYPESPN24                                  0.447166    
## EFAMTYPEOther                                  0.000826 ***
## LFSSTATEmployed, absent from work              0.010321 *  
## SCHOOLNFull-time student                       0.028390 *  
## SCHOOLNPart-time student                       0.738281    
## SCHOOLNUnknown                                       NA    
## MARSTATCommon-law                              0.980963    
## MARSTATWidowed                                 0.161598    
## MARSTATSeparated                               0.103728    
## MARSTATDivorced                                0.457165    
## MARSTATSingle, NM                              0.174288    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.077 on 17711 degrees of freedom
## Multiple R-squared:  0.516,  Adjusted R-squared:  0.5135 
## F-statistic: 209.8 on 90 and 17711 DF,  p-value: < 2.2e-16
# Diagnostic plots for the fitted `model`
# (2 x 2 panel layout is left disabled: par(mfrow = c(2, 2)))
# Margins are given as c(bottom, left, top, right), in lines of text.
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(model, lwd = 6)

# Box-Cox transformation (attributes selected by stepwise regression)
# Fit HRLYEARN on the untransformed scale; boxcox() then profiles the
# log-likelihood over candidate power transformations of the response.
bc.model <- lm(
  formula = HRLYEARN ~ NOC_10 + EDUC + TENURE + NAICS_18 + PROV +
    ESTSIZE + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN +
    UTOTHRS + EFAMTYPE + LFSSTAT + SCHOOLN + MARSTAT,
  data = train.19fem
)
# Large bottom margin (15 lines) for the likelihood plot drawn by boxcox().
par(mar = c(15, 5, 2, 1))
bc <- boxcox(bc.model, lambda = seq(-3, 3))

# Lambda with the highest profile log-likelihood. which.max() always yields a
# single index, whereas which(bc$y == max(bc$y)) relies on exact floating-point
# equality and returns several lambdas if the maximum is tied.
bc$x[which.max(bc$y)]
## [1] -0.1515152
# Refit the same predictors with the response raised to the power -0.15
# (the Box-Cox lambda above, rounded to two decimals).
# For positive HRLYEARN this transform is decreasing: larger earnings give a
# smaller response, so a positive coefficient points to lower earnings.
new.model <- lm(
  formula = (HRLYEARN)^-0.15 ~
    NOC_10 + EDUC + TENURE + NAICS_18 + PROV + ESTSIZE + AGE_12 + COWMAIN +
    PERMTEMP + FIRMSIZE + FTPTMAIN + UTOTHRS + EFAMTYPE + LFSSTAT + SCHOOLN +
    MARSTAT,
  data = train.19fem
)
summary(new.model)
## 
## Call:
## lm(formula = (HRLYEARN)^-0.15 ~ NOC_10 + EDUC + TENURE + NAICS_18 + 
##     PROV + ESTSIZE + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + 
##     FTPTMAIN + UTOTHRS + EFAMTYPE + LFSSTAT + SCHOOLN + MARSTAT, 
##     data = train.19fem)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34768 -0.03654  0.00094  0.03679  0.78197 
## 
## Coefficients: (1 not defined because of singularities)
##                                                  Estimate Std. Error t value
## (Intercept)                                     1.2645149  0.0096986 130.382
## NOC_10Business, finance & administration        0.0632019  0.0023946  26.393
## NOC_10Natural & applied sciences                0.0277562  0.0034809   7.974
## NOC_10Health                                    0.0266160  0.0027704   9.607
## NOC_10Educ., law, community & gov. services     0.0488443  0.0025974  18.805
## NOC_10Art, culture, recreation & sport          0.0663367  0.0042015  15.789
## NOC_10Sales & service                           0.0943841  0.0025035  37.701
## NOC_10Trades, transport & equipm. operators     0.0731511  0.0041075  17.809
## NOC_10Natural resources & agriculture           0.0629183  0.0074044   8.497
## NOC_10Manufacturing & utilities                 0.1068319  0.0044597  23.955
## EDUC.L                                         -0.0657568  0.0032288 -20.366
## EDUC.Q                                         -0.0156022  0.0029886  -5.221
## EDUC.C                                         -0.0143191  0.0023639  -6.057
## EDUC^4                                          0.0081766  0.0019953   4.098
## EDUC^5                                          0.0049324  0.0013584   3.631
## EDUC^6                                          0.0004752  0.0015660   0.303
## TENURE                                         -0.0512488  0.0017153 -29.877
## NAICS_18Forestry, Fishing, Min., Oil & Gas     -0.0989629  0.0084834 -11.665
## NAICS_18Utilities                              -0.0948650  0.0107376  -8.835
## NAICS_18Construction                           -0.0598373  0.0088688  -6.747
## NAICS_18Manufacturing durables                 -0.0649400  0.0089370  -7.266
## NAICS_18Manufacturing non-durables             -0.0492158  0.0087846  -5.603
## NAICS_18Wholesale Trade                        -0.0644516  0.0089000  -7.242
## NAICS_18Retail Trade                           -0.0186889  0.0083402  -2.241
## NAICS_18Transportation & Warehousing           -0.0441720  0.0087272  -5.061
## NAICS_18Finance, Insurance, Real Est. & Leas.  -0.0696109  0.0084226  -8.265
## NAICS_18Prof., Scientific & Technical Services -0.0635029  0.0084891  -7.480
## NAICS_18Management, Admin. & Support           -0.0341059  0.0085290  -3.999
## NAICS_18Educational Services                   -0.0446031  0.0085222  -5.234
## NAICS_18Health Care & Social Assistance        -0.0312260  0.0083429  -3.743
## NAICS_18Information, Culture & Recreation      -0.0364530  0.0086060  -4.236
## NAICS_18Accommodation & Food Services          -0.0228528  0.0084177  -2.715
## NAICS_18Other Services                         -0.0395185  0.0085653  -4.614
## NAICS_18Public Administration                  -0.0600222  0.0084973  -7.064
## PROVPEI                                         0.0082055  0.0037783   2.172
## PROVNS                                          0.0202298  0.0033103   6.111
## PROVNB                                          0.0171803  0.0033700   5.098
## PROVQC                                         -0.0040035  0.0028793  -1.390
## PROVON                                         -0.0146772  0.0027765  -5.286
## PROVMB                                          0.0067135  0.0030981   2.167
## PROVSK                                         -0.0067018  0.0031486  -2.129
## PROVAB                                         -0.0298605  0.0030216  -9.882
## PROVBC                                         -0.0184483  0.0029473  -6.259
## ESTSIZE.L                                      -0.0142272  0.0013763 -10.337
## ESTSIZE.Q                                      -0.0018528  0.0011465  -1.616
## ESTSIZE.C                                       0.0014875  0.0010649   1.397
## AGE_12.L                                        0.0112419  0.0039114   2.874
## AGE_12.Q                                        0.0348341  0.0034411  10.123
## AGE_12.C                                        0.0089578  0.0029301   3.057
## AGE_12^4                                       -0.0002968  0.0026260  -0.113
## AGE_12^5                                        0.0083008  0.0023750   3.495
## AGE_12^6                                        0.0002822  0.0021121   0.134
## AGE_12^7                                        0.0052047  0.0018755   2.775
## AGE_12^8                                        0.0021152  0.0016741   1.264
## AGE_12^9                                       -0.0009318  0.0015209  -0.613
## AGE_12^10                                      -0.0002565  0.0014338  -0.179
## AGE_12^11                                      -0.0010725  0.0014157  -0.758
## COWMAINPrivate sector                           0.0275927  0.0017307  15.943
## PERMTEMPTemp. season                            0.0184655  0.0031736   5.819
## PERMTEMPTemp. contract                          0.0085185  0.0019613   4.343
## PERMTEMPTemp. casual                            0.0160010  0.0023527   6.801
## FIRMSIZE.L                                     -0.0099365  0.0012940  -7.679
## FIRMSIZE.Q                                      0.0006736  0.0012258   0.550
## FIRMSIZE.C                                     -0.0005100  0.0012953  -0.394
## FTPTMAINPart-time                               0.0194216  0.0018118  10.719
## UTOTHRS                                         0.0367328  0.0071932   5.107
## EFAMTYPEHWDENC                                  0.0008833  0.0024325   0.363
## EFAMTYPEHWDE17                                  0.0005895  0.0024060   0.245
## EFAMTYPEHWDE24                                  0.0041420  0.0026235   1.579
## EFAMTYPEHWSHNC                                  0.0085380  0.0071081   1.201
## EFAMTYPEHWSH17                                  0.0102411  0.0062146   1.648
## EFAMTYPEHWSH24                                 -0.0131896  0.0083437  -1.581
## EFAMTYPEHWSWNC                                  0.0048180  0.0029858   1.614
## EFAMTYPEHWSW17                                  0.0106901  0.0041038   2.605
## EFAMTYPEHWSW24                                  0.0143981  0.0055481   2.595
## EFAMTYPEHWNENC                                  0.0131091  0.0059444   2.205
## EFAMTYPEHWNE17                                  0.0185854  0.0153457   1.211
## EFAMTYPEHWNE24                                  0.0038376  0.0137987   0.278
## EFAMTYPESPE17                                   0.0058985  0.0022775   2.590
## EFAMTYPESPE24                                   0.0086710  0.0035037   2.475
## EFAMTYPESPN17                                   0.0055439  0.0109115   0.508
## EFAMTYPESPN24                                   0.0133693  0.0110482   1.210
## EFAMTYPEOther                                   0.0082649  0.0022109   3.738
## LFSSTATEmployed, absent from work               0.0028700  0.0016724   1.716
## SCHOOLNFull-time student                        0.0086524  0.0025476   3.396
## SCHOOLNPart-time student                        0.0011124  0.0030006   0.371
## SCHOOLNUnknown                                         NA         NA      NA
## MARSTATCommon-law                              -0.0008973  0.0014612  -0.614
## MARSTATWidowed                                  0.0029769  0.0043359   0.687
## MARSTATSeparated                               -0.0092692  0.0033188  -2.793
## MARSTATDivorced                                 0.0003429  0.0028796   0.119
## MARSTATSingle, NM                               0.0027031  0.0022009   1.228
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## NOC_10Business, finance & administration        < 2e-16 ***
## NOC_10Natural & applied sciences               1.63e-15 ***
## NOC_10Health                                    < 2e-16 ***
## NOC_10Educ., law, community & gov. services     < 2e-16 ***
## NOC_10Art, culture, recreation & sport          < 2e-16 ***
## NOC_10Sales & service                           < 2e-16 ***
## NOC_10Trades, transport & equipm. operators     < 2e-16 ***
## NOC_10Natural resources & agriculture           < 2e-16 ***
## NOC_10Manufacturing & utilities                 < 2e-16 ***
## EDUC.L                                          < 2e-16 ***
## EDUC.Q                                         1.80e-07 ***
## EDUC.C                                         1.41e-09 ***
## EDUC^4                                         4.19e-05 ***
## EDUC^5                                         0.000283 ***
## EDUC^6                                         0.761548    
## TENURE                                          < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas      < 2e-16 ***
## NAICS_18Utilities                               < 2e-16 ***
## NAICS_18Construction                           1.56e-11 ***
## NAICS_18Manufacturing durables                 3.84e-13 ***
## NAICS_18Manufacturing non-durables             2.14e-08 ***
## NAICS_18Wholesale Trade                        4.61e-13 ***
## NAICS_18Retail Trade                           0.025049 *  
## NAICS_18Transportation & Warehousing           4.20e-07 ***
## NAICS_18Finance, Insurance, Real Est. & Leas.   < 2e-16 ***
## NAICS_18Prof., Scientific & Technical Services 7.75e-14 ***
## NAICS_18Management, Admin. & Support           6.39e-05 ***
## NAICS_18Educational Services                   1.68e-07 ***
## NAICS_18Health Care & Social Assistance        0.000183 ***
## NAICS_18Information, Culture & Recreation      2.29e-05 ***
## NAICS_18Accommodation & Food Services          0.006637 ** 
## NAICS_18Other Services                         3.98e-06 ***
## NAICS_18Public Administration                  1.68e-12 ***
## PROVPEI                                        0.029890 *  
## PROVNS                                         1.01e-09 ***
## PROVNB                                         3.47e-07 ***
## PROVQC                                         0.164415    
## PROVON                                         1.26e-07 ***
## PROVMB                                         0.030250 *  
## PROVSK                                         0.033308 *  
## PROVAB                                          < 2e-16 ***
## PROVBC                                         3.95e-10 ***
## ESTSIZE.L                                       < 2e-16 ***
## ESTSIZE.Q                                      0.106111    
## ESTSIZE.C                                      0.162469    
## AGE_12.L                                       0.004056 ** 
## AGE_12.Q                                        < 2e-16 ***
## AGE_12.C                                       0.002238 ** 
## AGE_12^4                                       0.910013    
## AGE_12^5                                       0.000475 ***
## AGE_12^6                                       0.893695    
## AGE_12^7                                       0.005525 ** 
## AGE_12^8                                       0.206423    
## AGE_12^9                                       0.540125    
## AGE_12^10                                      0.858006    
## AGE_12^11                                      0.448722    
## COWMAINPrivate sector                           < 2e-16 ***
## PERMTEMPTemp. season                           6.04e-09 ***
## PERMTEMPTemp. contract                         1.41e-05 ***
## PERMTEMPTemp. casual                           1.07e-11 ***
## FIRMSIZE.L                                     1.69e-14 ***
## FIRMSIZE.Q                                     0.582668    
## FIRMSIZE.C                                     0.693809    
## FTPTMAINPart-time                               < 2e-16 ***
## UTOTHRS                                        3.31e-07 ***
## EFAMTYPEHWDENC                                 0.716525    
## EFAMTYPEHWDE17                                 0.806441    
## EFAMTYPEHWDE24                                 0.114401    
## EFAMTYPEHWSHNC                                 0.229702    
## EFAMTYPEHWSH17                                 0.099388 .  
## EFAMTYPEHWSH24                                 0.113947    
## EFAMTYPEHWSWNC                                 0.106624    
## EFAMTYPEHWSW17                                 0.009197 ** 
## EFAMTYPEHWSW24                                 0.009464 ** 
## EFAMTYPEHWNENC                                 0.027448 *  
## EFAMTYPEHWNE17                                 0.225868    
## EFAMTYPEHWNE24                                 0.780926    
## EFAMTYPESPE17                                  0.009608 ** 
## EFAMTYPESPE24                                  0.013340 *  
## EFAMTYPESPN17                                  0.611402    
## EFAMTYPESPN24                                  0.226260    
## EFAMTYPEOther                                  0.000186 ***
## LFSSTATEmployed, absent from work              0.086171 .  
## SCHOOLNFull-time student                       0.000685 ***
## SCHOOLNPart-time student                       0.710844    
## SCHOOLNUnknown                                       NA    
## MARSTATCommon-law                              0.539187    
## MARSTATWidowed                                 0.492362    
## MARSTATSeparated                               0.005228 ** 
## MARSTATDivorced                                0.905203    
## MARSTATSingle, NM                              0.219413    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06251 on 17711 degrees of freedom
## Multiple R-squared:  0.5654, Adjusted R-squared:  0.5632 
## F-statistic: 256.1 on 90 and 17711 DF,  p-value: < 2.2e-16
# Diagnostic plots for the transformed fit `new.model`.
# Reset the margins first: the Box-Cox plot above used par(mar = c(15, 5, 2, 1)).
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(new.model, lwd = 6)

# Residual skewness, before vs. after the power transformation
# Untransformed fit (`model`):
skewness(model[["residuals"]])
## [1] 1.290823
# Transformed fit (`new.model`):
skewness(new.model[["residuals"]])
## [1] 0.7025932
# Prediction on the held-out set `test.19fem`
# Only the "fit" column of the returned matrix is used below.
# The rank-deficiency warning matches the one coefficient reported as
# "not defined because of singularities" (SCHOOLNUnknown) in summary(new.model).
prediction <- predict(new.model, interval = "prediction", newdata = test.19fem)
## Warning in predict.lm(new.model, interval = "prediction", newdata = test.19fem):
## prediction from a rank-deficient fit may be misleading
# Errors on the transformed (HRLYEARN^-0.15) scale
n.test <- nrow(test.19fem)
actual.bc <- test.19fem$HRLYEARN^(-0.15)
errors <- prediction[, "fit"] - actual.bc
hist(errors)

# NOTE: rmse, mae and diff.25 below are measured on the transformed scale.
# The power -0.15 compresses the response, so a 25% relative error here is a
# far larger relative error in HRLYEARN itself; see the original-scale metrics
# at the end of this chunk.
rmse <- sqrt(sum((errors)^2) / n.test)
mae <- (1 / n.test) * sum(abs(errors))
diff.percent <- 100 * (abs(errors) / actual.bc)
# Count hits with sum() on the logical mask: indexing with a mask that contains
# NA (length(x[x <= 25])) would count the NA entries as hits.
diff.25 <- sum(diff.percent <= 25, na.rm = TRUE) / n.test
paste("RMSE:", rmse)
## [1] "RMSE: 0.0642458061529313"
paste("MAE:", mae)
## [1] "MAE: 0.0470490092790988"
paste("Percentage of cases with less than 25% error:", diff.25*100)
## [1] "Percentage of cases with less than 25% error: 99.8951507208388"

# Same metrics on the original HRLYEARN scale: invert y^-0.15 with the power
# 1 / -0.15 (valid for positive fitted values).
fit.orig <- prediction[, "fit"]^(1 / -0.15)
errors.orig <- fit.orig - test.19fem$HRLYEARN
rmse.orig <- sqrt(mean(errors.orig^2))
mae.orig <- mean(abs(errors.orig))
diff.percent.orig <- 100 * abs(errors.orig) / test.19fem$HRLYEARN
diff.25.orig <- sum(diff.percent.orig <= 25, na.rm = TRUE) / n.test
paste("RMSE (HRLYEARN scale):", rmse.orig)
paste("MAE (HRLYEARN scale):", mae.orig)
paste("Percentage of cases within 25% error (HRLYEARN scale):", diff.25.orig * 100)

STEP 5: Gender Proportions by Sector, Industry and Occupation

# write.csv(data.all, file = "Data_All.csv", row.names=FALSE)

Z-Test for Independent Proportions

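  • Compare the proportion of women in 2009 with the proportion in 2019
  • Tested separately by Sector, Industry and Occupation
  • R's prop.test() reports the result as a chi-squared statistic (X-squared, with continuity correction) rather than as a z statistic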
# SECTOR #######################################################################
# Private
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     in this sector; n = rows of data.all for the same year and sector.
# alternative = "greater" tests H1: prop 1 (2009) > prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$COWMAIN == "Private sector", ]),
    nrow(data.all.19fem[data.all.19fem$COWMAIN == "Private sector", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$COWMAIN == "Private sector", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$COWMAIN == "Private sector", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$COWMAIN == "Private sector", ]), nrow(data.all.19fem[data.all.19fem$COWMAIN == "Private sector", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$COWMAIN == "Private sector", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$COWMAIN == "Private sector", ]))
## X-squared = 15.371, df = 1, p-value = 4.417e-05
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.008155203 1.000000000
## sample estimates:
##    prop 1    prop 2 
## 0.4612816 0.4472080
# Public
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     in this sector; n = rows of data.all for the same year and sector.
# alternative = "less" tests H1: prop 1 (2009) < prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$COWMAIN == "Public sector", ]),
    nrow(data.all.19fem[data.all.19fem$COWMAIN == "Public sector", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$COWMAIN == "Public sector", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$COWMAIN == "Public sector", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$COWMAIN == "Public sector", ]), nrow(data.all.19fem[data.all.19fem$COWMAIN == "Public sector", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$COWMAIN == "Public sector", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$COWMAIN == "Public sector", ]))
## X-squared = 3.0416, df = 1, p-value = 0.04058
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.0000000000 -0.0005711443
## sample estimates:
##    prop 1    prop 2 
## 0.6309408 0.6410219
# INDUSTRY ####################################################################
# Construction
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "Const"; n = rows of data.all for that year/industry.
# alternative = "less" tests H1: prop 1 (2009) < prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Const", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Const", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "Const", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Const", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Const", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Const", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Const", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Const", ]))
## X-squared = 8.7586, df = 1, p-value = 0.001541
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.000000000 -0.009921907
## sample estimates:
##    prop 1    prop 2 
## 0.1061281 0.1286706
# Forestry, Fishing, Min., Oil & Gas
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "Fores"; n = rows of data.all for that year/industry.
# alternative = "less" tests H1: prop 1 (2009) < prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Fores", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Fores", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "Fores", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Fores", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Fores", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Fores", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Fores", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Fores", ]))
## X-squared = 0.58053, df = 1, p-value = 0.2231
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.00000000  0.01162354
## sample estimates:
##    prop 1    prop 2 
## 0.1507024 0.1613774
# Manufacturing durables
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "ManuD"; n = rows of data.all for that year/industry.
# alternative = "greater" tests H1: prop 1 (2009) > prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ManuD", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ManuD", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "ManuD", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "ManuD", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ManuD", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ManuD", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "ManuD", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ManuD", ]))
## X-squared = 0.11841, df = 1, p-value = 0.3654
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.01337678  1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.1972265 0.1933480
# Utilities
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "Utils"; n = rows of data.all for that year/industry.
# alternative = "less" tests H1: prop 1 (2009) < prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Utils", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Utils", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "Utils", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Utils", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Utils", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Utils", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Utils", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Utils", ]))
## X-squared = 0.13892, df = 1, p-value = 0.3547
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.00000000  0.03201062
## sample estimates:
##    prop 1    prop 2 
## 0.2211690 0.2323232
# Transportation & Warehousing
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "Trans"; n = rows of data.all for that year/industry.
# alternative = "less" tests H1: prop 1 (2009) < prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Trans", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Trans", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "Trans", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Trans", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Trans", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Trans", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Trans", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Trans", ]))
## X-squared = 0.0096919, df = 1, p-value = 0.4608
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.00000000  0.01918169
## sample estimates:
##    prop 1    prop 2 
## 0.2668506 0.2684642
# Wholesale Trade
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "Whole"; n = rows of data.all for that year/industry.
# alternative = "less" tests H1: prop 1 (2009) < prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Whole", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Whole", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "Whole", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Whole", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Whole", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Whole", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Whole", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Whole", ]))
## X-squared = 0.75298, df = 1, p-value = 0.1928
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.0000000  0.0122707
## sample estimates:
##    prop 1    prop 2 
## 0.2888638 0.3031447
# Manufacturing non-durables
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "ManuN"; n = rows of data.all for that year/industry.
# alternative = "greater" tests H1: prop 1 (2009) > prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ManuN", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ManuN", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "ManuN", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "ManuN", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ManuN", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ManuN", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "ManuN", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ManuN", ]))
## X-squared = 0.23825, df = 1, p-value = 0.3127
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.01560947  1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.3618397 0.3548527
# Agriculture
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "Agri"; n = rows of data.all for that year/industry.
# alternative = "less" tests H1: prop 1 (2009) < prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Agri", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Agri", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "Agri", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Agri", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Agri", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Agri", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Agri", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Agri", ]))
## X-squared = 3.4405, df = 1, p-value = 0.03181
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.000000000 -0.005276911
## sample estimates:
##    prop 1    prop 2 
## 0.3077994 0.3554302
# Management, Admin. & Support
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "Mngt"; n = rows of data.all for that year/industry.
# alternative = "greater" tests H1: prop 1 (2009) > prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Mngt", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Mngt", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "Mngt", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Mngt", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Mngt", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Mngt", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Mngt", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Mngt", ]))
## X-squared = 5.9742, df = 1, p-value = 0.007258
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.01358235 1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.4528406 0.4108575
# Prof., Scientific & Technical Services
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "ProSc"; n = rows of data.all for that year/industry.
# alternative = "greater" tests H1: prop 1 (2009) > prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ProSc", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ProSc", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "ProSc", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "ProSc", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ProSc", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ProSc", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "ProSc", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ProSc", ]))
## X-squared = 3.7569, df = 1, p-value = 0.0263
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.00431574 1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.4972222 0.4683240
# Information, Culture & Recreation
# x = rows of data.all.09fem / data.all.19fem (presumably the female subsets)
#     with NAICS_18short == "Info"; n = rows of data.all for that year/industry.
# alternative = "greater" tests H1: prop 1 (2009) > prop 2 (2019).
# NOTE(review): the one-sided direction mirrors the observed proportions;
# confirm it was chosen before looking at the data.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Info", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Info", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 &
                    data.all$NAICS_18short == "Info", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Info", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Info", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Info", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Info", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Info", ]))
## X-squared = 1.826, df = 1, p-value = 0.0883
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.004620229  1.000000000
## sample estimates:
##    prop 1    prop 2 
## 0.4938215 0.4720129
# Public Administration ("PubAd"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is less than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this industry (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "PubAd", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "PubAd", ])
  ),
  # Trials: all rows of data.all in this industry for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "PubAd", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "PubAd", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "PubAd", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "PubAd", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "PubAd", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "PubAd", ]))
## X-squared = 0.48639, df = 1, p-value = 0.2428
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.00000000  0.01108627
## sample estimates:
##    prop 1    prop 2 
## 0.4997339 0.5081690
# Other Services ("Other"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this industry (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Other", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Other", ])
  ),
  # Trials: all rows of data.all in this industry for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Other", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Other", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Other", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Other", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Other", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Other", ]))
## X-squared = 2.3303, df = 1, p-value = 0.06344
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.001929488  1.000000000
## sample estimates:
##    prop 1    prop 2 
## 0.5481518 0.5227273
# Retail Trade ("Rtail"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this industry (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Rtail", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Rtail", ])
  ),
  # Trials: all rows of data.all in this industry for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Rtail", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Rtail", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Rtail", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Rtail", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Rtail", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Rtail", ]))
## X-squared = 18.252, df = 1, p-value = 9.675e-06
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.0226837 1.0000000
## sample estimates:
##    prop 1    prop 2 
## 0.5812002 0.5441633
# Finance, Insurance, Real Est. & Leas. ("Finan"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this industry (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Finan", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Finan", ])
  ),
  # Trials: all rows of data.all in this industry for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Finan", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Finan", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Finan", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Finan", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Finan", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Finan", ]))
## X-squared = 18.143, df = 1, p-value = 1.025e-05
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.03481883 1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.6524696 0.5953711
# Accommodation & Food Services ("AcFood"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this industry (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "AcFood", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "AcFood", ])
  ),
  # Trials: all rows of data.all in this industry for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "AcFood", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "AcFood", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "AcFood", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "AcFood", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "AcFood", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "AcFood", ]))
## X-squared = 1.8235, df = 1, p-value = 0.08845
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.003296079  1.000000000
## sample estimates:
##    prop 1    prop 2 
## 0.6379529 0.6225989
# Educational Services ("Educa"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is less than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this industry (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Educa", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Educa", ])
  ),
  # Trials: all rows of data.all in this industry for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Educa", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Educa", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Educa", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Educa", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Educa", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Educa", ]))
## X-squared = 7.4584, df = 1, p-value = 0.003157
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.0000000 -0.0105416
## sample estimates:
##    prop 1    prop 2 
## 0.6808045 0.7075028
# Health Care & Social Assistance ("Health"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this industry (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Health", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Health", ])
  ),
  # Trials: all rows of data.all in this industry for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Health", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Health", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Health", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Health", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Health", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Health", ]))
## X-squared = 6.7094, df = 1, p-value = 0.004795
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.005639885 1.000000000
## sample estimates:
##    prop 1    prop 2 
## 0.8534341 0.8378633
# OCCUPATION ###################################################################
# Trades, transport & equipm. operators ("Trades"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is less than the second (2019) proportion.
# NOTE(review): in the printed results of this section the chosen direction
# always matches the sign of the observed difference, and many tests are run
# at the 0.05 level -- confirm the directions were fixed a priori and consider
# a multiplicity adjustment (e.g. p.adjust()).
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "Trades", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "Trades", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Trades", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Trades", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "Trades", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "Trades", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Trades", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Trades", ]))
## X-squared = 5.403, df = 1, p-value = 0.01005
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.000000000 -0.002573939
## sample estimates:
##     prop 1     prop 2 
## 0.06091718 0.06989448
# Natural resources & agriculture ("NatAgri"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is less than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "NatAgri", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "NatAgri", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "NatAgri", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatAgri", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "NatAgri", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "NatAgri", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "NatAgri", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatAgri", ]))
## X-squared = 8.4056, df = 1, p-value = 0.00187
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.0000000 -0.0175078
## sample estimates:
##    prop 1    prop 2 
## 0.1682479 0.2092875
# Natural & applied sciences ("NatASc"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is less than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "NatASc", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "NatASc", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "NatASc", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatASc", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "NatASc", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "NatASc", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "NatASc", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatASc", ]))
## X-squared = 1.1179, df = 1, p-value = 0.1452
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.000000000  0.005891185
## sample estimates:
##    prop 1    prop 2 
## 0.2213018 0.2322076
# Manufacturing & utilities ("ManUtil"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "ManUtil", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "ManUtil", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "ManUtil", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ManUtil", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "ManUtil", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "ManUtil", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "ManUtil", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ManUtil", ]))
## X-squared = 7.1194, df = 1, p-value = 0.003813
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.01237363 1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.2831858 0.2506759
# Management ("Mngt"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "Mngt", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "Mngt", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Mngt", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Mngt", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "Mngt", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "Mngt", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Mngt", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Mngt", ]))
## X-squared = 0.68827, df = 1, p-value = 0.2034
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.01000759  1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.4310008 0.4205007
# Art, culture, recreation & sport ("ArtCul"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "ArtCul", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "ArtCul", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "ArtCul", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ArtCul", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "ArtCul", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "ArtCul", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "ArtCul", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ArtCul", ]))
## X-squared = 1.8359, df = 1, p-value = 0.08771
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.006708564  1.000000000
## sample estimates:
##   prop 1   prop 2 
## 0.605042 0.572590
# Sales & service ("Sales"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "Sales", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "Sales", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Sales", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Sales", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "Sales", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "Sales", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Sales", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Sales", ]))
## X-squared = 13.65, df = 1, p-value = 0.0001101
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.01223069 1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.5964467 0.5743233
# Educ., law, community & gov. services ("EduLaw"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is less than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "EduLaw", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "EduLaw", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "EduLaw", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "EduLaw", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "EduLaw", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "EduLaw", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "EduLaw", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "EduLaw", ]))
## X-squared = 0.28984, df = 1, p-value = 0.2952
## alternative hypothesis: less
## 95 percent confidence interval:
##  -1.000000000  0.009369943
## sample estimates:
##    prop 1    prop 2 
## 0.7124247 0.7171559
# Business, finance & administration ("BusFin"): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "BusFin", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "BusFin", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "BusFin", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "BusFin", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "BusFin", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "BusFin", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "BusFin", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "BusFin", ]))
## X-squared = 5.3661, df = 1, p-value = 0.01027
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.004375397 1.000000000
## sample estimates:
##    prop 1    prop 2 
## 0.7521890 0.7369202
# Health ("Health" occupation group): 2009 vs 2019
# One-sided two-sample test of equal proportions; the alternative is that the
# first (2009) proportion is greater than the second (2019) proportion.
prop.test(
  # Successes: rows of the *fem subsets in this occupation (presumably the
  # female-only records of each survey year -- confirm where they are built).
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "Health", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "Health", ])
  ),
  # Trials: all rows of data.all in this occupation for the matching year.
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Health", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Health", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "Health", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "Health", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Health", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Health", ]))
## X-squared = 5.5633, df = 1, p-value = 0.00917
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.005612905 1.000000000
## sample estimates:
##    prop 1    prop 2 
## 0.8661738 0.8474368